From 4aa2f2cdde0575c64725c6eb56e5075a613ae612 Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Thu, 18 Jul 2024 01:31:30 -0700 Subject: [PATCH 01/94] Adding configs related to DCLM --- config/data/dclm_gpt_neo.yaml | 74 ++++++++++++++++++++++++++++++++++ config/llama_7b_with_dclm.yaml | 29 +++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 config/data/dclm_gpt_neo.yaml create mode 100644 config/llama_7b_with_dclm.yaml diff --git a/config/data/dclm_gpt_neo.yaml b/config/data/dclm_gpt_neo.yaml new file mode 100644 index 000000000..3bdd3f521 --- /dev/null +++ b/config/data/dclm_gpt_neo.yaml @@ -0,0 +1,74 @@ +cache_dir: "gs://marin-data/tokenized/dclm/gpt_neo_tokenizer" +tokenizer: "EleutherAI/gpt-neox-20b" +stop_strategy: restart +configs: + "dclm": + train_urls: + - gs://marin-data/datacomp/dclm-baseline-dedup-07-09/*/*/*.jsonl.zstd + # these are just for eval + "paloma/4chan": + validation_urls: + - gs://levanter-data/paloma/4chan_meta_sep/val/val*.jsonl.gz + "paloma/c4_100_domains": + validation_urls: + - gs://levanter-data/paloma/c4_100_domains/val/val*.jsonl.gz + "paloma/c4_en": + validation_urls: + - gs://levanter-data/paloma/c4_en/val/val*.jsonl.gz + "paloma/dolma-v1_5": + validation_urls: + - gs://levanter-data/paloma/dolma-v1_5/val/val*.jsonl.gz + "paloma/dolma_100_programing_languages": + validation_urls: + - gs://levanter-data/paloma/dolma_100_programing_languages/val/val*.jsonl.gz + "paloma/dolma_100_subreddits": + validation_urls: + - gs://levanter-data/paloma/dolma_100_subreddits/val/val*.jsonl.gz + "paloma/falcon-refinedweb": + validation_urls: + - gs://levanter-data/paloma/falcon-refinedweb/val/val*.jsonl.gz + "paloma/gab": + validation_urls: + - gs://levanter-data/paloma/gab/val/val*.jsonl.gz + "paloma/m2d2_s2orc_unsplit": + validation_urls: + - gs://levanter-data/paloma/m2d2_s2orc_unsplit/val/val*.jsonl.gz + "paloma/m2d2_wikipedia_unsplit": + validation_urls: + - gs://levanter-data/paloma/m2d2_wikipedia_unsplit/val/val*.jsonl.gz + "paloma/manosphere_meta_sep": + validation_urls: + - gs://levanter-data/paloma/manosphere_meta_sep/val/val*.jsonl.gz + "paloma/mc4": + validation_urls: + - gs://levanter-data/paloma/mc4/val/val*.jsonl.gz + "paloma/ptb": + validation_urls: + - gs://levanter-data/paloma/ptb/val/val*.jsonl.gz + "paloma/redpajama": + validation_urls: + - gs://levanter-data/paloma/redpajama/val/val*.jsonl.gz + "paloma/twitterAAE_HELM_fixed": + validation_urls: + - gs://levanter-data/paloma/twitterAAE_HELM_fixed/val/val*.jsonl.gz + "paloma/wikitext_103": + validation_urls: + - gs://levanter-data/paloma/wikitext_103/val/val*.jsonl.gz +train_weights: + dclm: 1.0 + paloma/4chan: 0.0 + paloma/c4_100_domains: 0.0 + paloma/c4_en: 0.0 + paloma/dolma-v1_5: 0.0 + paloma/dolma_100_programing_languages: 0.0 + paloma/dolma_100_subreddits: 0.0 + paloma/falcon-refinedweb: 0.0 + paloma/gab: 0.0 + paloma/m2d2_s2orc_unsplit: 0.0 + paloma/m2d2_wikipedia_unsplit: 0.0 + paloma/manosphere_meta_sep: 0.0 + paloma/mc4: 0.0 + paloma/ptb: 0.0 + paloma/redpajama: 0.0 + paloma/twitterAAE_HELM_fixed: 0.0 + paloma/wikitext_103: 0.0 \ No newline at end of file diff --git a/config/llama_7b_with_dclm.yaml b/config/llama_7b_with_dclm.yaml new file mode 100644 index 000000000..6a35d3706 --- /dev/null +++ b/config/llama_7b_with_dclm.yaml @@ -0,0 +1,29 @@ +data: !include data/dclm_gpt_neo.yaml +model: # 7B class model + type: llama + seq_len: 2048 + hidden_dim: 4096 + intermediate_dim: 11008 + num_layers: 32 + num_heads: 32 + num_kv_heads: 32 + use_flash_attention: True + 
flash_attention_block_size: 1024 +trainer: + tracker: + type: wandb + project: "marin" + tags: ["dclm", "7B", "llama"] + + mp: p=f32,c=bfloat16 + train_batch_size: 2048 + num_train_steps: 750000 # 3,000,000,000,000 / 4,000,000 = 750,000 + steps_per_eval: 1000 + tensor_parallel_axes: ["mlp", "heads"] + fsdp_axis: "embed" + batch_axis: "batch" +optimizer: + learning_rate: 4E-4 + weight_decay: 0.1 + min_lr_ratio: 0.1 + warmup: 0.01 \ No newline at end of file From dde9ed0b2557abbc3a74012d865322c779a7155d Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Thu, 18 Jul 2024 17:14:41 -0700 Subject: [PATCH 02/94] Adding configs related to DCLM --- config/llama_1b_dclm.yaml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 config/llama_1b_dclm.yaml diff --git a/config/llama_1b_dclm.yaml b/config/llama_1b_dclm.yaml new file mode 100644 index 000000000..85c2c59df --- /dev/null +++ b/config/llama_1b_dclm.yaml @@ -0,0 +1,30 @@ +data: !include data/dclm_gpt_neo.yaml +model: # 1B class model + type: llama + seq_len: 2048 + hidden_dim: 2048 + intermediate_dim: 8192 + num_layers: 24 + num_heads: 16 + num_kv_heads: 16 + use_flash_attention: True + flash_attention_block_size: 1024 +trainer: + tracker: + type: wandb + project: "marin" + tags: ["llama", "fineweb", "markdown"] + + mp: p=f32,c=bfloat16 + train_batch_size: 256 # 2048 * 2048 = 4,194,304 + num_train_steps: 71526 # 300,000,000,000 / 4,194,304 = 71,526 + steps_per_eval: 1000 + tensor_parallel_axes: ["mlp", "heads"] + fsdp_axis: "embed" + batch_axis: "batch" +optimizer: + learning_rate: 3E-3 + weight_decay: 0.033 + min_lr_ratio: 0.1 + warmup: 5000 + cooldown: 3E-5 From b991e29aee336c4f35447f9635b03e825e5a77c1 Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Fri, 19 Jul 2024 03:14:52 -0700 Subject: [PATCH 03/94] Adding Z loss --- src/levanter/models/llama.py | 4 ++++ src/levanter/models/lm_model.py | 11 +++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/levanter/models/llama.py b/src/levanter/models/llama.py index 2a2d2664d..bd2ad7e05 100644 --- a/src/levanter/models/llama.py +++ b/src/levanter/models/llama.py @@ -53,6 +53,7 @@ class LlamaConfig(HFCompatConfig): Note that num_heads must be divisible by this number. Defaults to 32. activation_function (str, optional): activation function for the hidden layer. Defaults to "silu". rope_scaling (Dict, optional): dict containing the scaling configuration for the Rotary Positional Embedding. + z_loss_weight (float, optional): weight for the z-loss. Defaults to 0.0, no z-loss. 
""" seq_len: int = 2048 @@ -64,6 +65,7 @@ class LlamaConfig(HFCompatConfig): activation_function: str = "silu" initializer_range: float = 0.02 layer_norm_epsilon: float = 1e-5 + z_loss_weight: float = 0.0 # Attention-related config upcast_attn: bool = False @@ -121,6 +123,7 @@ def from_hf_config(cls, hf_config: HfConfig): layer_norm_epsilon=hf_config.rms_norm_eps, rope_scaling=hf_config.rope_scaling, rope_theta=hf_config.rope_theta, + z_loss_weight=0.0, # z loss is not present in HF config ) def to_hf_config(self, vocab_size: int, config_overrides: Optional[Dict] = None) -> HfLlamaConfig: @@ -556,6 +559,7 @@ def __call__( lm_logits = self.lm_head(x, key=k_head) return lm_logits + def resize_vocab(self, new_size: int, key=None) -> "LmHeadModel[LlamaConfig]": new_Vocab = self.Vocab.resize(new_size) k1, k2 = maybe_rng_split(key, 2) diff --git a/src/levanter/models/lm_model.py b/src/levanter/models/lm_model.py index 543c6a5ca..f83a9999d 100644 --- a/src/levanter/models/lm_model.py +++ b/src/levanter/models/lm_model.py @@ -11,7 +11,7 @@ from haliax.nn import cross_entropy_loss from levanter.models.attention import AttentionMask - +from levanter.models.loss import cross_entropy_and_logsumexp_penalty LmConfigT = TypeVar("LmConfigT", bound="LmConfig") LmT = TypeVar("LmT", bound="LmHeadModel") @@ -131,9 +131,12 @@ def compute_loss( logits = logits.astype(jnp.float32) targets = hax.roll(example.tokens, -1, axis=self.Pos.name) target_y = hax.nn.one_hot(targets, self.Vocab, dtype=logits.dtype) - loss = cross_entropy_loss( - logits, self.Vocab, target_y, reduction, reduction_axis=reduction_axis, where=example.loss_mask - ) + if hasattr(self.config, "z_loss_weight") and self.config.z_loss_weight > 0: + loss = cross_entropy_and_logsumexp_penalty(logits, self.Vocab, target_y, logsumexp_weight=self.config.z_loss_weight) + else: + loss = cross_entropy_loss( + logits, self.Vocab, target_y, reduction, reduction_axis=reduction_axis, where=example.loss_mask + ) return loss From bb674bbc70f3b59dbf01f8f503b3e8a8ded552dd Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Fri, 19 Jul 2024 03:23:03 -0700 Subject: [PATCH 04/94] pre commit changes --- config/data/dclm_gpt_neo.yaml | 2 +- config/llama_7b_with_dclm.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/data/dclm_gpt_neo.yaml b/config/data/dclm_gpt_neo.yaml index 3bdd3f521..36dbf69e6 100644 --- a/config/data/dclm_gpt_neo.yaml +++ b/config/data/dclm_gpt_neo.yaml @@ -71,4 +71,4 @@ train_weights: paloma/ptb: 0.0 paloma/redpajama: 0.0 paloma/twitterAAE_HELM_fixed: 0.0 - paloma/wikitext_103: 0.0 \ No newline at end of file + paloma/wikitext_103: 0.0 diff --git a/config/llama_7b_with_dclm.yaml b/config/llama_7b_with_dclm.yaml index 6a35d3706..9703fc2d5 100644 --- a/config/llama_7b_with_dclm.yaml +++ b/config/llama_7b_with_dclm.yaml @@ -26,4 +26,4 @@ optimizer: learning_rate: 4E-4 weight_decay: 0.1 min_lr_ratio: 0.1 - warmup: 0.01 \ No newline at end of file + warmup: 0.01 From 6c99dfb84ed4a16a81c9967c87b18baf8edd7021 Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Fri, 19 Jul 2024 12:25:47 -0700 Subject: [PATCH 05/94] Adding z_loss as part of train_lm.py --- src/levanter/main/train_lm.py | 29 +++++++++++++++++++++++++++-- src/levanter/models/llama.py | 3 +-- src/levanter/models/lm_model.py | 32 +++++++++++++++++++++++++++++++- 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/src/levanter/main/train_lm.py b/src/levanter/main/train_lm.py index 00099c86f..1b5075eec 100644 --- a/src/levanter/main/train_lm.py +++ 
b/src/levanter/main/train_lm.py @@ -8,7 +8,7 @@ import jax.random as jrandom import haliax as hax -from haliax import Axis +from haliax import Axis, Scalar from haliax.partitioning import named_jit, round_axis_for_partitioning import levanter @@ -19,12 +19,29 @@ from levanter.models.lm_model import LmConfig from levanter.optim import AdamConfig, OptimizerConfig from levanter.trainer import Trainer, TrainerConfig +from levanter.types import ComputeLossFunction, M, X from levanter.utils.jax_utils import parameter_count logger = logging.getLogger(__name__) +class ModuleComputeZLoss(ComputeLossFunction[M, X]): + """ + Loss that just delegates to the model's compute_z_loss method. + """ + + def __call__( + self, + model, + *inputs: X, + reduction: Optional[hax.ReductionFunction] = hax.mean, + reduction_axis: Optional[hax.AxisSelection] = None, + **kwargs, + ) -> Scalar | hax.NamedArray: + return model.compute_z_loss(*inputs, reduction=reduction, reduction_axis=reduction_axis, **kwargs) + + @dataclass class TrainLmConfig: data: Union[LMDatasetConfig, LMMixtureDatasetConfig] = field(default_factory=LMDatasetConfig) @@ -48,6 +65,7 @@ class TrainLmConfig: update_hessian_steps: int = 10 data_seed: Optional[int] = None # if provided, will override the data seed from the trainer + z_loss_weight: float = 0.0 def main(config: TrainLmConfig): @@ -82,11 +100,18 @@ def main(config: TrainLmConfig): levanter.initialize(config) optimizer = config.optimizer.build(config.trainer.num_train_steps) + loss_fn: Optional[ComputeLossFunction] = None + + if config.z_loss_weight > 0: + loss_fn = ModuleComputeZLoss() + else: + loss_fn = None # It will be automatically set to the default loss function in the model + # Using the trainer as a context manager does 3 things: # 1. Sets the device mesh # 2. Sets the axis mapping (for fsdp) # 3. 
Sets the global metrics tracker - with Trainer(config.trainer, optimizer) as trainer: + with Trainer(config.trainer, optimizer, loss_fn) as trainer: # randomness in jax is tightly controlled by "keys" which are the states of the random number generators # this makes deterministic training pretty easy seed = config.trainer.seed diff --git a/src/levanter/models/llama.py b/src/levanter/models/llama.py index bd2ad7e05..724826321 100644 --- a/src/levanter/models/llama.py +++ b/src/levanter/models/llama.py @@ -123,7 +123,7 @@ def from_hf_config(cls, hf_config: HfConfig): layer_norm_epsilon=hf_config.rms_norm_eps, rope_scaling=hf_config.rope_scaling, rope_theta=hf_config.rope_theta, - z_loss_weight=0.0, # z loss is not present in HF config + z_loss_weight=0.0, # z loss is not present in HF config ) def to_hf_config(self, vocab_size: int, config_overrides: Optional[Dict] = None) -> HfLlamaConfig: @@ -559,7 +559,6 @@ def __call__( lm_logits = self.lm_head(x, key=k_head) return lm_logits - def resize_vocab(self, new_size: int, key=None) -> "LmHeadModel[LlamaConfig]": new_Vocab = self.Vocab.resize(new_size) k1, k2 = maybe_rng_split(key, 2) diff --git a/src/levanter/models/lm_model.py b/src/levanter/models/lm_model.py index f83a9999d..dcda291da 100644 --- a/src/levanter/models/lm_model.py +++ b/src/levanter/models/lm_model.py @@ -9,10 +9,12 @@ import haliax as hax from haliax import Axis, NamedArray from haliax.nn import cross_entropy_loss +from haliax.nn.loss import maybe_reduce_loss from levanter.models.attention import AttentionMask from levanter.models.loss import cross_entropy_and_logsumexp_penalty + LmConfigT = TypeVar("LmConfigT", bound="LmConfig") LmT = TypeVar("LmT", bound="LmHeadModel") @@ -132,7 +134,9 @@ def compute_loss( targets = hax.roll(example.tokens, -1, axis=self.Pos.name) target_y = hax.nn.one_hot(targets, self.Vocab, dtype=logits.dtype) if hasattr(self.config, "z_loss_weight") and self.config.z_loss_weight > 0: - loss = cross_entropy_and_logsumexp_penalty(logits, self.Vocab, target_y, logsumexp_weight=self.config.z_loss_weight) + loss = cross_entropy_and_logsumexp_penalty( + logits, self.Vocab, target_y, logsumexp_weight=self.config.z_loss_weight + ) else: loss = cross_entropy_loss( logits, self.Vocab, target_y, reduction, reduction_axis=reduction_axis, where=example.loss_mask @@ -140,6 +144,32 @@ def compute_loss( return loss + def compute_z_loss( + self, + example: LmExample, + z_loss_weight, + *, + key=None, + reduction: Optional[hax.ReductionFunction] = hax.mean, + reduction_axis: Optional[hax.AxisSelection] = None, + ) -> jnp.ndarray | NamedArray: + """ + Computes the cross-entropy loss for a language modeling example with z_loss. + If reduction is not None, the loss is reduced + across the reduction axis (with reduction_axis=None meaning all axes). If reduction is None, the loss is not + reduced, and the result is a named array with axes (*batch axes, sequence_length). 
+ """ + logits = self(example.tokens, example.attn_mask, key=key) + # TODO: would be nice if we made the dtype configurable + logits = logits.astype(jnp.float32) + targets = hax.roll(example.tokens, -1, axis=self.Pos.name) + target_y = hax.nn.one_hot(targets, self.Vocab, dtype=logits.dtype) + loss = cross_entropy_and_logsumexp_penalty( + logits, self.Vocab, target_y, logsumexp_weight=self.config.z_loss_weight + ) + loss = maybe_reduce_loss(loss, reduction=reduction, reduction_axis=reduction_axis, where=example.loss_mask) + return loss + @property def vocab_size(self) -> int: return self.Vocab.size From 24469e786bf599ba8cf86135d707dca6ee5dc0fb Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Fri, 19 Jul 2024 12:29:55 -0700 Subject: [PATCH 06/94] Reverting changes to llama.py for z_loss --- src/levanter/models/llama.py | 3 --- src/levanter/models/lm_model.py | 11 +++-------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/levanter/models/llama.py b/src/levanter/models/llama.py index 724826321..2a2d2664d 100644 --- a/src/levanter/models/llama.py +++ b/src/levanter/models/llama.py @@ -53,7 +53,6 @@ class LlamaConfig(HFCompatConfig): Note that num_heads must be divisible by this number. Defaults to 32. activation_function (str, optional): activation function for the hidden layer. Defaults to "silu". rope_scaling (Dict, optional): dict containing the scaling configuration for the Rotary Positional Embedding. - z_loss_weight (float, optional): weight for the z-loss. Defaults to 0.0, no z-loss. """ seq_len: int = 2048 @@ -65,7 +64,6 @@ class LlamaConfig(HFCompatConfig): activation_function: str = "silu" initializer_range: float = 0.02 layer_norm_epsilon: float = 1e-5 - z_loss_weight: float = 0.0 # Attention-related config upcast_attn: bool = False @@ -123,7 +121,6 @@ def from_hf_config(cls, hf_config: HfConfig): layer_norm_epsilon=hf_config.rms_norm_eps, rope_scaling=hf_config.rope_scaling, rope_theta=hf_config.rope_theta, - z_loss_weight=0.0, # z loss is not present in HF config ) def to_hf_config(self, vocab_size: int, config_overrides: Optional[Dict] = None) -> HfLlamaConfig: diff --git a/src/levanter/models/lm_model.py b/src/levanter/models/lm_model.py index dcda291da..e0f4b9f2d 100644 --- a/src/levanter/models/lm_model.py +++ b/src/levanter/models/lm_model.py @@ -133,14 +133,9 @@ def compute_loss( logits = logits.astype(jnp.float32) targets = hax.roll(example.tokens, -1, axis=self.Pos.name) target_y = hax.nn.one_hot(targets, self.Vocab, dtype=logits.dtype) - if hasattr(self.config, "z_loss_weight") and self.config.z_loss_weight > 0: - loss = cross_entropy_and_logsumexp_penalty( - logits, self.Vocab, target_y, logsumexp_weight=self.config.z_loss_weight - ) - else: - loss = cross_entropy_loss( - logits, self.Vocab, target_y, reduction, reduction_axis=reduction_axis, where=example.loss_mask - ) + loss = cross_entropy_loss( + logits, self.Vocab, target_y, reduction, reduction_axis=reduction_axis, where=example.loss_mask + ) return loss From 76092c4b1d3af6df6243e6df1125c4d036c545a8 Mon Sep 17 00:00:00 2001 From: Ivan Zhou Date: Sun, 21 Jul 2024 09:55:16 -0700 Subject: [PATCH 07/94] Address capacity_type and env variables (#665) --- docs/Getting-Started-TPU-VM.md | 2 +- infra/launch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md index dcdeba02f..f13e98541 100644 --- a/docs/Getting-Started-TPU-VM.md +++ b/docs/Getting-Started-TPU-VM.md @@ -108,7 +108,7 @@ zone: us-west4-a tpu_name: 
test-spin-up-32 tpu_type: "v5litepod-16" vm_image: "tpu-ubuntu2204-base" -preemptible: true +capacity_type: "preemptible" autodelete: false subnetwork: "default" diff --git a/infra/launch.py b/infra/launch.py index 631193905..69399e585 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -158,7 +158,7 @@ def _default_run_id(): cli.add_arg(parser, config, ["--github_token"], type=str) parser.add_argument( - "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=config.get("env", {}).items() + "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=list(config.get("env", {}).items()) ) parser.add_argument("command", nargs=argparse.REMAINDER) From 2e558569d9fca60c3ad0e92f184f274116bf290d Mon Sep 17 00:00:00 2001 From: David Hall Date: Wed, 24 Jul 2024 14:29:48 -0700 Subject: [PATCH 08/94] fix best effort test (#662) --- pyproject.toml | 2 +- src/levanter/utils/jax_utils.py | 3 ++- tests/test_jax_utils.py | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b0d766429..489219016 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ dependencies = [ # "haliax>=1.3,<2.0", # Haliax changes in step with levanter, so we'll just use the git version except for releases. # "haliax @ git+https://github.com/stanford-crfm/haliax.git@main", - "haliax>=1.4.dev301", + "haliax>=1.4.dev307", "equinox>=0.11.4", "jaxtyping>=0.2.20", "tokenizers>=0.15.2", diff --git a/src/levanter/utils/jax_utils.py b/src/levanter/utils/jax_utils.py index df22c3810..d159d7948 100644 --- a/src/levanter/utils/jax_utils.py +++ b/src/levanter/utils/jax_utils.py @@ -266,7 +266,8 @@ def best_effort_sharding(shape, *, devices=None, mesh=None): gcd = np.gcd(shape_i, num_devices) num_devices //= gcd device_shape = (num_devices, gcd) + device_shape[1:] - sharding = PositionalSharding(devices).reshape(list(device_shape)).replicate(axis=0, keepdims=True) + sharding = PositionalSharding(devices).reshape(list(device_shape)) + sharding = sharding.replicate(axis=0, keepdims=False) return sharding else: # get the existing mesh and find the FSDP axis diff --git a/tests/test_jax_utils.py b/tests/test_jax_utils.py index 70c3d3588..c768e3661 100644 --- a/tests/test_jax_utils.py +++ b/tests/test_jax_utils.py @@ -16,6 +16,8 @@ def _assert_can_put_with_sharding(array, sharding): @skip_if_not_enough_devices(8) def test_best_effort_sharding(): + if len(jax.devices()) % 8 != 0: + pytest.skip("Not enough devices") # 1D array, 8 devices array = np.arange(8) sharding = best_effort_sharding(array.shape) From 2e64f14d01a1a191a63df0a3e480397ed24808ff Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 25 Jul 2024 17:53:20 -0700 Subject: [PATCH 09/94] Enable multislice in launch script (#666) * refactor queued-resources * fix multislice * add auto tear down * reuse docker image * tiny fix * switch to concurrent executor for parallel subprocesses & small fix & logs --- infra/helpers/cli.py | 35 +++++++- infra/launch.py | 195 ++++++++++++++++++++++++++++--------------- 2 files changed, 163 insertions(+), 67 deletions(-) diff --git a/infra/helpers/cli.py b/infra/helpers/cli.py index dbc477d95..d065c6a1d 100644 --- a/infra/helpers/cli.py +++ b/infra/helpers/cli.py @@ -1,4 +1,5 @@ import argparse +import concurrent.futures import os import subprocess import typing @@ -23,9 +24,12 @@ def add_ssh_key(ssh_key_filename): subprocess.check_call(["ssh-add", ssh_key_filename]) -def tpu_ssh(tpu_name, zone, *args, ignore_failure=False): +def tpu_ssh(tpu_name, zone, 
node_count, *args, ignore_failure=False): add_ssh_key(os.path.expanduser("~/.ssh/google_compute_engine")) try: + if node_count > 1: + return _tpu_ssh_multislice(tpu_name, zone, node_count, *args, ignore_failure=ignore_failure) + return run_command( "gcloud", "alpha", @@ -45,6 +49,35 @@ def tpu_ssh(tpu_name, zone, *args, ignore_failure=False): raise +def _tpu_ssh_multislice(tpu_name, zone, node_count, *args, ignore_failure=False): + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = [ + executor.submit( + run_command, + "gcloud", + "alpha", + "compute", + "tpus", + "tpu-vm", + "ssh", + f"{tpu_name}-{i}", + "--worker=all", + f"--zone={zone}", + "--command=%s" % " ".join(args), + ) + for i in range(node_count) + ] + + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except subprocess.CalledProcessError as e: + if ignore_failure: + print("Ignoring failure:", e) + else: + raise + + # Oddly enough, there's no API to simply fetch the current gcloud configuration... def gcloud_config(): client = storage.Client() diff --git a/infra/launch.py b/infra/launch.py index 69399e585..ac3b3c521 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -3,6 +3,7 @@ import argparse import base64 import getpass +import json import os import subprocess import time @@ -11,11 +12,12 @@ from infra.helpers import cli -def setup_vm_docker(tpu_name, zone, docker_base_image): +def setup_vm_docker(tpu_name, zone, node_count, docker_base_image): """Change docker permissions on `tpu_name`, remove any old runs, and setup the cache volume.""" cli.tpu_ssh( tpu_name, zone, + node_count, "sudo", "usermod", "-aG", @@ -38,44 +40,65 @@ def setup_vm_docker(tpu_name, zone, docker_base_image): def list_tpus(zone): - tpus = subprocess.check_output( - [ - "gcloud", - "alpha", - "compute", - "tpus", - "tpu-vm", - "list", - "--zone=" + zone, - ] + return json.loads( + subprocess.check_output( + [ + "gcloud", + "alpha", + "compute", + "tpus", + "queued-resources", + "list", + f"--zone={zone}", + "--format=json(name.basename(), state)", + ] + ) ) - rows = tpus.decode("utf-8").split("\n") - header = rows[0].split() - tpus = [] - for row in rows[1:]: - if row: - tpus.append(dict(zip(header, row.split()))) - return tpus - - -def start_tpu_vm(tpu_name, *, tpu_type, capacity_type, version, zone, autodelete): - tpu_exists = any([tpu["NAME"] == tpu_name for tpu in list_tpus(zone)]) - if tpu_exists: - if not autodelete: + + +def describe_tpu(tpu_name, zone): + try: + return json.loads( + subprocess.check_output( + [ + "gcloud", + "alpha", + "compute", + "tpus", + "queued-resources", + "describe", + tpu_name, + f"--zone={zone}", + "--format=json(name.basename(), state)", + ] + ) + ) + except subprocess.CalledProcessError: + return None + + +def start_tpu_vm(tpu_name, *, tpu_type, capacity_type, version, zone, autodelete, node_count): + tpu_stat = describe_tpu(tpu_name, zone) + if tpu_stat is not None: + if tpu_stat["state"]["state"] in ["FAILED", "SUSPENDED"]: + print("TPU suspended, bypassing autodelete config and deleting...") + elif not autodelete: print("TPU already exists and autodelete is false, leaving it as is.") return + else: + print("TPU already exists, deleting...") - print("TPU already exists, deleting...") cli.run_command( "gcloud", "alpha", "compute", "tpus", - "tpu-vm", + "queued-resources", "delete", + tpu_name, "--quiet", f"--zone={zone}", - tpu_name, + "--force", ) print(f"Creating new TPU {tpu_name} in {zone} of type {tpu_type}...") @@ -84,16 +107,16 @@ def 
start_tpu_vm(tpu_name, *, tpu_type, capacity_type, version, zone, autodelete "alpha", "compute", "tpus", - "tpu-vm", + "queued-resources", "create", tpu_name, f"--accelerator-type={tpu_type}", - f"--version={version}", - "--zone=" + zone, + f"--runtime-version={version}", + f"--zone={zone}", "--quiet", ] - if capacity_type == "preemptible": - command.append("--preemptible") + if capacity_type in ["preemptible", "best-effort"]: + command.append("--best-effort") elif capacity_type == "reserved": command.append("--reserved") elif capacity_type == "spot": @@ -102,8 +125,34 @@ def start_tpu_vm(tpu_name, *, tpu_type, capacity_type, version, zone, autodelete pass else: raise ValueError(f"Unknown capacity type: {capacity_type}") + + if node_count == 1: + command.append(f"--node-id={tpu_name}") + else: + command.append(f"--node-count={node_count}") + cli.run_command(*command) + # wait for queued resource to complete + print("Checking TPU creation status every minute...") + waited = 0 + while True: + time.sleep(60) + waited += 1 + + tpu_stat = describe_tpu(tpu_name, zone) + assert tpu_stat is not None, f"{tpu_name} creation failed." + + match tpu_stat["state"]["state"]: + case "ACTIVE": + break + case "FAILED": + raise RuntimeError( + f"{tpu_name} creation failed: {tpu_stat['state']['failedData']['error']['message']}" + ) + case _: + print(f"Status is {tpu_stat['state']['state']}. Waited {waited} minutes...") + def _default_run_id(): """Generate a run ID for wandb and continuation. @@ -131,7 +180,11 @@ def _default_run_id(): cli.add_arg(parser, config, ["--foreground"], default=False, action="store_true") cli.add_arg(parser, config, ["--image_name"], default=f"levanter-{getpass.getuser()}") cli.add_arg( - parser, config, ["--capacity_type"], default=None, choices=["preemptible", "spot", "reserved", "on-demand"] + parser, + config, + ["--capacity_type"], + default=None, + choices=["preemptible", "spot", "reserved", "on-demand", "best-effort"], ) cli.add_arg( parser, @@ -149,6 +202,7 @@ def _default_run_id(): cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"]) cli.add_arg(parser, config, ["--tpu_name"], required=True) cli.add_arg(parser, config, ["--tpu_type"], required=True) + cli.add_arg(parser, config, ["--node_count"], default=1) cli.add_arg(parser, config, ["--version"], default="tpu-ubuntu2204-base") cli.add_arg(parser, config, ["--zone"], required=True) cli.add_arg(parser, config, ["--retries"], default=0, type=int) @@ -178,6 +232,7 @@ def _default_run_id(): retries = args.retries tpu_name = args.tpu_name tpu_type = args.tpu_type + node_count = args.node_count version = args.version zone = args.zone run_id = args.run_id @@ -197,14 +252,25 @@ def _default_run_id(): # make an image tag based on the unix timestamp to ensure we always pull the latest image tag = int(time.time()) - full_image_id = push_docker.push_to_gcp( - project_id=project, - region=region, - repository=docker_repository, - image_name=image_id, - tag=tag, - docker_file="docker/tpu/Dockerfile.incremental", - ) + if registry == "ghcr": + full_image_id = push_docker.push_to_github( + local_image=image_id, + tag=tag, + github_user=github_user, + github_token=github_token, + docker_file="docker/tpu/Dockerfile.incremental", + ) + elif registry == "gcp": + full_image_id = push_docker.push_to_gcp( + project_id=project, + region=region, + repository=docker_repository, + image_name=image_id, + tag=tag, + docker_file="docker/tpu/Dockerfile.incremental", + ) + else: + raise ValueError(f"Unknown docker registry: 
{args.docker_registry}") for i in range(retries + 1): try: @@ -215,6 +281,7 @@ def _default_run_id(): version=version, zone=zone, autodelete=autodelete, + node_count=node_count, ) # We don't technically need to setup on every run, but if we are working on a @@ -222,32 +289,10 @@ def _default_run_id(): setup_vm_docker( tpu_name=tpu_name, zone=zone, + node_count=node_count, docker_base_image=docker_base_image, ) - # make an image tag based on the unix timestamp to ensure we always pull the latest image - tag = int(time.time()) - - if registry == "ghcr": - full_image_id = push_docker.push_to_github( - local_image=image_id, - tag=tag, - github_user=github_user, - github_token=github_token, - docker_file="docker/tpu/Dockerfile.incremental", - ) - elif registry == "gcp": - full_image_id = push_docker.push_to_gcp( - project_id=project, - region=region, - repository=docker_repository, - image_name=image_id, - tag=tag, - docker_file="docker/tpu/Dockerfile.incremental", - ) - else: - raise ValueError(f"Unknown docker registry: {args.docker_registry}") - git_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip() docker_command = [ @@ -277,8 +322,26 @@ def _default_run_id(): docker_command.extend([full_image_id, " ".join(command)]) print(f"Running on tpu_name... {tpu_name}") - cli.tpu_ssh(tpu_name, zone, *docker_command) + cli.tpu_ssh(tpu_name, zone, node_count, *docker_command) except subprocess.CalledProcessError as e: # noqa: F841 - print("Error running command.") + print(f"Error running command {e.cmd}") if i < retries - 1: print("Retrying... %d/%d" % (i + 1, retries)) + else: + print("Job finished with no error.") + break + + if autodelete: + print("Autodelete is set to True. Tear down machine...") + cli.run_command( + "gcloud", + "alpha", + "compute", + "tpus", + "queued-resources", + "delete", + tpu_name, + "--quiet", + f"--zone={zone}", + "--force", + ) From 4950a8e4529c1ba4eb20e3d37e875a07b85d788e Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 25 Jul 2024 20:36:10 -0700 Subject: [PATCH 10/94] Fineweb Text + Partial revert of kiloshard (#669) * Add llama 1b with fineweb txt * replace with 50 fineweb urls * wip * revert many of the changes, which seems to fix the crashing * revert many of the changes, which seems to fix the crashing * remove now-unused option * cleanup * cleanup * sigh * Adding changes for dclm --------- Co-authored-by: Ivan Zhou Co-authored-by: Abhinav Garg --- config/data/fineweb_llama_txt.yaml | 124 +++++++ config/llama_1b_with_fineweb_txt.yaml | 29 ++ pyproject.toml | 2 +- src/levanter/data/shard_cache.py | 459 ++++++++++++------------- src/levanter/data/sharded_dataset.py | 7 +- src/levanter/utils/ray_utils.py | 1 - tests/test_shard_cache.py | 28 +- tests/test_tokenized_document_cache.py | 8 +- 8 files changed, 397 insertions(+), 261 deletions(-) create mode 100644 config/data/fineweb_llama_txt.yaml create mode 100644 config/llama_1b_with_fineweb_txt.yaml diff --git a/config/data/fineweb_llama_txt.yaml b/config/data/fineweb_llama_txt.yaml new file mode 100644 index 000000000..49c9dc888 --- /dev/null +++ b/config/data/fineweb_llama_txt.yaml @@ -0,0 +1,124 @@ +cache_dir: "gs://marin-data/tokenized/fineweb/llama2_tokenizer/txt" +tokenizer: "meta-llama/Llama-2-7b-hf" +stop_strategy: restart +configs: + "fineweb": + train_urls: + # - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-*/*/*_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00000/{0..257}_processed.jsonl.gz + - 
gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00001/{0..258}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00002/{0..260}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00003/{0..261}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00004/{0..262}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00005/{0..262}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00006/{0..263}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00007/{0..263}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00008/{0..263}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00009/{0..263}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00010/{0..263}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00011/{0..265}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00012/{0..265}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00013/{0..266}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00014/{0..265}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00015/{0..265}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00016/{0..266}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00017/{0..266}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00018/{0..267}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00019/{0..266}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00020/{0..267}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00021/{0..267}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00022/{0..269}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00023/{0..267}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00024/{0..268}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00025/{0..268}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00026/{0..269}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00027/{0..269}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00028/{0..269}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00029/{0..269}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00030/{0..270}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00031/{0..270}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00032/{0..270}_processed.jsonl.gz + - 
gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00033/{0..271}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00034/{0..271}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00035/{0..271}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00036/{0..272}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00037/{0..272}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00038/{0..272}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00039/{0..272}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00040/{0..272}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00041/{0..272}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00042/{0..273}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00043/{0..272}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00044/{0..273}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00045/{0..274}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00046/{0..274}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00047/{0..273}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00048/{0..274}_processed.jsonl.gz + - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00049/{0..275}_processed.jsonl.gz + # these are just for eval + "paloma/4chan": + validation_urls: + - gs://levanter-data/paloma/4chan_meta_sep/val/val*.jsonl.gz + "paloma/c4_100_domains": + validation_urls: + - gs://levanter-data/paloma/c4_100_domains/val/val*.jsonl.gz + "paloma/c4_en": + validation_urls: + - gs://levanter-data/paloma/c4_en/val/val*.jsonl.gz + "paloma/dolma-v1_5": + validation_urls: + - gs://levanter-data/paloma/dolma-v1_5/val/val*.jsonl.gz + "paloma/dolma_100_programing_languages": + validation_urls: + - gs://levanter-data/paloma/dolma_100_programing_languages/val/val*.jsonl.gz + "paloma/dolma_100_subreddits": + validation_urls: + - gs://levanter-data/paloma/dolma_100_subreddits/val/val*.jsonl.gz + "paloma/falcon-refinedweb": + validation_urls: + - gs://levanter-data/paloma/falcon-refinedweb/val/val*.jsonl.gz + "paloma/gab": + validation_urls: + - gs://levanter-data/paloma/gab/val/val*.jsonl.gz + "paloma/m2d2_s2orc_unsplit": + validation_urls: + - gs://levanter-data/paloma/m2d2_s2orc_unsplit/val/val*.jsonl.gz + "paloma/m2d2_wikipedia_unsplit": + validation_urls: + - gs://levanter-data/paloma/m2d2_wikipedia_unsplit/val/val*.jsonl.gz + "paloma/manosphere_meta_sep": + validation_urls: + - gs://levanter-data/paloma/manosphere_meta_sep/val/val*.jsonl.gz + "paloma/mc4": + validation_urls: + - gs://levanter-data/paloma/mc4/val/val*.jsonl.gz + "paloma/ptb": + validation_urls: + - gs://levanter-data/paloma/ptb/val/val*.jsonl.gz + "paloma/redpajama": + validation_urls: + - gs://levanter-data/paloma/redpajama/val/val*.jsonl.gz + "paloma/twitterAAE_HELM_fixed": + validation_urls: + - gs://levanter-data/paloma/twitterAAE_HELM_fixed/val/val*.jsonl.gz + "paloma/wikitext_103": + validation_urls: + - 
gs://levanter-data/paloma/wikitext_103/val/val*.jsonl.gz +train_weights: + fineweb: 1.0 + paloma/4chan: 0.0 + paloma/c4_100_domains: 0.0 + paloma/c4_en: 0.0 + paloma/dolma-v1_5: 0.0 + paloma/dolma_100_programing_languages: 0.0 + paloma/dolma_100_subreddits: 0.0 + paloma/falcon-refinedweb: 0.0 + paloma/gab: 0.0 + paloma/m2d2_s2orc_unsplit: 0.0 + paloma/m2d2_wikipedia_unsplit: 0.0 + paloma/manosphere_meta_sep: 0.0 + paloma/mc4: 0.0 + paloma/ptb: 0.0 + paloma/redpajama: 0.0 + paloma/twitterAAE_HELM_fixed: 0.0 + paloma/wikitext_103: 0.0 diff --git a/config/llama_1b_with_fineweb_txt.yaml b/config/llama_1b_with_fineweb_txt.yaml new file mode 100644 index 000000000..1251edecd --- /dev/null +++ b/config/llama_1b_with_fineweb_txt.yaml @@ -0,0 +1,29 @@ +data: !include data/fineweb_llama_txt.yaml +model: # 1B class model + type: llama + seq_len: 2048 + hidden_dim: 2048 + intermediate_dim: 8192 + num_layers: 16 + num_heads: 16 + num_kv_heads: 16 + use_flash_attention: True + flash_attention_block_size: 1024 +trainer: + tracker: + type: wandb + project: "marin" + tags: ["fineweb", "llama"] + + mp: p=f32,c=bfloat16 + train_batch_size: 1024 + num_train_steps: 750000 # 3,000,000,000,000 / 4,000,000 = 750,000 + steps_per_eval: 1000 + tensor_parallel_axes: ["mlp", "heads"] + fsdp_axis: "embed" + batch_axis: "batch" +optimizer: + learning_rate: 4E-4 + weight_decay: 0.1 + min_lr_ratio: 0.1 + warmup: 5000 diff --git a/pyproject.toml b/pyproject.toml index 489219016..c1cbc8427 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ dependencies = [ "matplotlib>=3.7.0", "tblib>=1.7.0,<4.0.0", "dataclasses-json~=0.6.4", - "ray[default]~=2.10", + "ray[default]==2.32.0", "pydantic<3", # temporary pin until Ray supports pydantic 2.0 "rich~=13.0", "filelock~=3.13", diff --git a/src/levanter/data/shard_cache.py b/src/levanter/data/shard_cache.py index eb006bbdc..f27008e45 100644 --- a/src/levanter/data/shard_cache.py +++ b/src/levanter/data/shard_cache.py @@ -4,7 +4,6 @@ import heapq import logging as pylogging import os -import random import threading import time from contextlib import AbstractContextManager @@ -29,7 +28,6 @@ ser_exc_info, ) from ._preprocessor import BatchProcessor, BatchResult, as_record_batch, dict_from_record_batch -from ._process_interleave import GroupRoundRobinBuffer, InProgressSequence from ._queue import ( PriorityProcessorActor, PriorityWorkItem, @@ -55,6 +53,7 @@ LEDGER_FILE_NAME = "cache_ledger.json" LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +LEVEL_TO_LOG = pylogging.DEBUG def build_or_load_cache( @@ -66,9 +65,6 @@ def build_or_load_cache( await_finished: bool = True, monitors: Optional[Sequence["MetricsMonitor"]] = None, cache_config: Optional[Dict[str, Any]] = None, - *, - randomize_shards: bool = True, - shards_to_read_at_once: int = DEFAULT_MAX_SHARDS_TO_READ_AT_ONCE, ) -> "ShardCache": """ Produces a sharded cache of the dataset using Ray for distributed processing. 
The cache can be any path @@ -107,8 +103,6 @@ def build_or_load_cache( batch_size=batch_size, rows_per_chunk=rows_per_chunk, cache_config=cache_config, - randomize_shards=randomize_shards, - shards_to_read_at_once=shards_to_read_at_once, ) if cache.is_finished: @@ -347,30 +341,31 @@ def _shard_reader_generator(shard_source: ShardedDataset[T], shard_name: str, st @dataclass -class ShardToBeProcessed(PriorityWorkTaskGroupSpec): +class ShardGroupToBeProcessed(PriorityWorkTaskGroupSpec): name: str builder_ref: ray.actor.ActorHandle # _ChunkCacheBuilder writer: ray.actor.ActorHandle # _GroupedShardWriter shard_source: ShardedDataset - shard_name: str - priority_fn: Callable[[int], float] + shard_names: Sequence[str] + priority_fn: Callable[[int, int], float] processor_actor: ray.actor.ActorHandle # BatchProcessorQueue batch_size: int num_rows_per_chunk: int group_id: int def build(self) -> "PriorityWorkTaskGroup": - return ShardTaskGroup(self) + return ShardGroupTaskGroup(self) -class ShardTaskGroup(PriorityWorkTaskGroup): - def __init__(self, spec: ShardToBeProcessed): - self.spec: ShardToBeProcessed = spec +class ShardGroupTaskGroup(PriorityWorkTaskGroup): + def __init__(self, spec: ShardGroupToBeProcessed): + self.spec = spec self.logger = pylogging.getLogger(f"shard_reader.{spec.group_id}.{spec.name}") - shard_name = self.spec.shard_name try: - shard_metadata: ShardMetadata = _initial_shard_metadata(shard_name, self.spec.writer) + metadata: dict[str, ShardMetadata] = _initial_shard_metadatas( + self.spec.shard_source, self.spec.shard_names, self.spec.writer + ) except Exception as e: self.spec.builder_ref.other_failed.remote(ser_exc_info()) raise e @@ -379,22 +374,27 @@ def __init__(self, spec: ShardToBeProcessed): self._items: list[PriorityWorkItem] = [] - try: - reader = _shard_reader_generator(self.spec.shard_source, shard_name, shard_metadata.total_rows, batch_size) + for shard_name in self.spec.shard_names: + shard_idx = self.spec.shard_source.shard_names.index(shard_name) + try: + shard_metadata = metadata[shard_name] + reader = _shard_reader_generator( + self.spec.shard_source, shard_name, shard_metadata.total_rows, batch_size + ) - if shard_metadata.is_finished: - self.logger.info(f"Shard {shard_name} already finished. Skipping.") + if shard_metadata.is_finished: + self.logger.info(f"Shard {shard_name} already finished. Skipping.") - task_name = f"shard_reader.{self.spec.name}.{shard_name}" + task_name = f"shard_reader.{self.spec.name}.{shard_name}" - chunk_idx = len(shard_metadata.chunks) - item = ShardReaderItem(self, task_name, shard_name, chunk_idx, reader) + chunk_idx = len(shard_metadata.chunks) + item = ShardReaderItem(self, task_name, shard_name, shard_idx, chunk_idx, reader) - heapq.heappush(self._items, item) - except Exception as e: - self.logger.exception(f"Error while initializing shard {shard_name}") - self.spec.writer[shard_name].shard_failed.remote(ser_exc_info()) - raise e + heapq.heappush(self._items, item) + except Exception as e: + self.logger.exception(f"Error while initializing shard {shard_name}") + self.spec.writer[shard_name].shard_failed.remote(ser_exc_info()) + raise e @property def name(self): @@ -412,15 +412,16 @@ class ShardReaderItem(PriorityWorkItem): and dispatches them to the processor. 
""" - group: ShardTaskGroup + group: ShardGroupTaskGroup name: str shard_name: str + shard_idx: int chunk_idx: int reader: Iterator[list] @property def priority(self): - return self.group.spec.priority_fn(self.chunk_idx) + return self.group.spec.priority_fn(self.shard_idx, self.chunk_idx) @property def spec(self): @@ -448,7 +449,7 @@ def execute(self) -> tuple[bool, Optional[ray.ObjectRef]]: total_chunk_rows += len(batch) if batch: - priority = self.spec.priority_fn(self.chunk_idx) + priority = self.spec.priority_fn(self.shard_idx, self.chunk_idx) # these times aren't exact because the times might be from different machines # but they're just for logging time_in = time.time() @@ -489,8 +490,13 @@ def execute(self) -> tuple[bool, Optional[ray.ObjectRef]]: raise e -def _initial_shard_metadata(shard_name, shard_group_writer): - return ray.get(shard_group_writer.current_metadata.remote(shard_name)) +def _initial_shard_metadatas(shard_source, shard_names, shard_group_writer): + shard_metadatas: dict[str, ShardMetadata] = {} + _metadata_futures = [shard_group_writer.current_metadata.remote(name) for name in shard_names] + shard_metadatas_rs = ray.get(_metadata_futures) + for shard_name, shard_metadata in zip(shard_names, shard_metadatas_rs): + shard_metadatas[shard_name] = shard_metadata + return shard_metadatas def _serialize_json_and_commit(path, obj): @@ -541,7 +547,7 @@ def is_finished_and_buffer_empty(self): class _GroupShardWriterWorker: def __init__(self, parent_ref, cache_dir: str, shard_names: Sequence[str]): with log_failures_to(parent_ref): - pylogging.basicConfig(level=pylogging.INFO, format=LOG_FORMAT) + pylogging.basicConfig(level=LEVEL_TO_LOG, format=LOG_FORMAT) self.cache_dir = cache_dir self.shard_names = shard_names self.shard_writers: dict[str, _ShardWriterWorker] = { @@ -615,7 +621,7 @@ def __init__( cache_dir: str, shard_name: str, ): - pylogging.basicConfig(level=pylogging.INFO, format=LOG_FORMAT) + pylogging.basicConfig(level=LEVEL_TO_LOG, format=LOG_FORMAT) self.parent_ref = parent_ref self.cache_dir = cache_dir self.shard_name = shard_name @@ -806,185 +812,118 @@ def __init__( source: ShardedDataset[T], processor: BatchProcessor[T], rows_per_chunk: int, - randomize_shards: bool, - shards_to_read_at_once: int, ): with log_failures_to(broker_ref): pylogging.basicConfig(level=pylogging.INFO, format=LOG_FORMAT) - self.name = name self.logger = pylogging.getLogger(f"{__name__}.{name}") self.broker_ref = broker_ref + self.shard_status: Dict[str, _ShardStatus] = dict() + self._current_round_robin = [] self.source = source - self.processor = processor - self.rows_per_chunk = rows_per_chunk - self.cache_dir = cache_dir - self._shard_writers: list = [] self._metrics = InProgressCacheMetrics() + self_ref = current_actor_handle() + if len(source.shard_names) == 0: self.logger.warning("No shards to index?!?") self._finish() - return - - self.logger.info(f"Starting cache build for {len(source.shard_names)} shards") - # we process shards in groups of DEFAULT_MAX_SHARDS_TO_READ_AT_ONCE - # once one group is done, we move on to the next group - # this is to avoid having too many open file handles at once (we have seen 30K shards) - # once a shard group is finished, we start the next group - num_shards = len(source.shard_names) - num_shard_groups = min(num_shards, shards_to_read_at_once) - self._num_groups = num_shard_groups - - # we do a permutation of the shard names (using a seed) to get a stable order and as a kind of poor man's shuffle - shuffled_shards = list(source.shard_names) - 
if randomize_shards: - random.Random(42).shuffle(shuffled_shards) - # now assign shards to groups - self._shard_groups = [shuffled_shards[i::num_shard_groups] for i in range(num_shard_groups)] - self._shard_name_to_group = {} - for group_id, shard_group in enumerate(self._shard_groups): - for shard_name in shard_group: - self._shard_name_to_group[shard_name] = group_id - - self._chunk_counts_for_group = [0] * num_shard_groups - self._chunk_counts_for_shard: dict[str, int] = {s: 0 for s in source.shard_names} - self._active_shard_for_group = [-1] * num_shard_groups - self._expected_chunk_totals: dict[str, int] = {} - - self._current_round_robin: GroupRoundRobinBuffer[int, ChunkMetadata] = GroupRoundRobinBuffer( - range(num_shard_groups) - ) - - self._shard_readers = self._initialize_workers() - # if we have a bunch of caches to build with one shard, we don't want them all - # assigned to the same node, so we use an offset based on the hash of the name (for stability) - # in an attempt to spread them out - self._worker_offset = int(hash(name) % len(self._shard_readers)) - - for group in range(num_shard_groups): - self._kick_off_next_shard_for_group(group) - - def _kick_off_next_shard_for_group(self, group): - self_ref = current_actor_handle() - - def priority_fn(chunk_idx): - return chunk_idx * self._num_groups + group + else: + self.logger.info(f"Starting cache build for {len(source.shard_names)} shards") + + self._shard_writers = [] + self._shard_readers = [] + self._processor_actors = [] + + for shard_name in source.shard_names: + self._current_round_robin.append(shard_name) + self.shard_status[shard_name] = _ShardStatus() + + num_shards = len(source.shard_names) + num_worker_groups = len(ray.nodes()) + num_shard_groups = max(min(num_worker_groups, num_shards), 1) + + # if we have a bunch of caches to build with one shard, we don't want them all + # assigned to the same node, so we use an offset based on the hash of the name (for stability) + # in an attempt to spread them out + group_offset = int(hash(name) % num_worker_groups) + + shard_groups: list[list[str]] = [[] for _ in range(num_shard_groups)] + for i, shard_name in enumerate(source.shard_names): + shard_groups[i % num_shard_groups].append(shard_name) + + def priority_fn(shard_idx, chunk_idx): + return chunk_idx * num_shards + shard_idx + + for group_id, shard_group in enumerate(shard_groups): + writer = _GroupShardWriterWorker.remote(self_ref, cache_dir, shard_group) # type: ignore + self._shard_writers.append(writer) + + # TODO: would probably be better if we didn't create one of these per shard group + processor_actor = _BatchProcessorQueue.remote(processor) # type: ignore + self._processor_actors.append(processor_actor) + + work_item = ShardGroupToBeProcessed( + name=name, + builder_ref=self_ref, + writer=writer, + shard_source=source, + shard_names=shard_group, + priority_fn=priority_fn, + processor_actor=processor_actor, + batch_size=processor.batch_size, + num_rows_per_chunk=rows_per_chunk, + group_id=group_id, + ) - shard_group = self._shard_groups[group] - next_shard_in_group = self._active_shard_for_group[group] + 1 + # we want global names so that different tasks can coordinate priorities + worker_to_assign = (group_id + group_offset) % num_worker_groups + priority_actor_name = f"priority_processor.{worker_to_assign}" - if next_shard_in_group >= len(shard_group): - self.logger.debug(f"Group {group} finished") - return + reader_actor = PriorityProcessorActor.options( # type: ignore + name=priority_actor_name, 
get_if_exists=True + ).remote() - to_process = [shard_group[next_shard_in_group]] - - writer = _GroupShardWriterWorker.remote(self_ref, self.cache_dir, to_process) # type: ignore - self._shard_writers.append(writer) - - # TODO: would probably be better if we didn't create one of these per shard - processor_actor = _BatchProcessorQueue.remote(self.processor) # type: ignore - - work_item = ShardToBeProcessed( - name=self.name, - builder_ref=self_ref, - writer=writer, - shard_source=self.source, - shard_name=shard_group[next_shard_in_group], - priority_fn=priority_fn, - processor_actor=processor_actor, - batch_size=self.processor.batch_size, - num_rows_per_chunk=self.rows_per_chunk, - group_id=group, - ) + reader_actor.assign_work.remote(work_item) - # we want global names so that different tasks can coordinate priorities - worker_to_assign = (next_shard_in_group + self._worker_offset) % len(self._shard_readers) - self._active_shard_for_group[group] = next_shard_in_group - self._shard_readers[worker_to_assign].assign_work.remote(work_item) - - def _initialize_workers(self): - shard_readers = [] - num_workers = len(ray.nodes()) - if num_workers == 0: - raise ValueError("No workers available") - for worker_id in range(num_workers): - priority_actor_name = f"priority_processor.{worker_id}" - reader_actor = PriorityProcessorActor.options( # type: ignore - name=priority_actor_name, get_if_exists=True - ).remote() - - shard_readers.append(reader_actor) - return shard_readers + self._shard_readers.append(reader_actor) def new_chunk(self, shard_name: str, *chunks: ChunkMetadata): """Callback method for when a shard worker has produced a new chunk.""" - with log_failures_to(self.broker_ref): - # self._current_round_robin.extend_group(shard_name, *chunks) - group_id = self._shard_name_to_group[shard_name] - count = self._chunk_counts_for_group[group_id] - for chunk in chunks: - self._current_round_robin.append_to_group(group_id, count, chunk) - count += 1 - - self._chunk_counts_for_group[group_id] = count - self._chunk_counts_for_shard[shard_name] = self._chunk_counts_for_shard.get(shard_name, 0) + len(chunks) - - # if we have buffered chunks, we need to check if we can send them to the broker - self._attempt_to_flush_buffers() - self._update_metrics_for_new_chunks(chunks) - - if shard_name in self._expected_chunk_totals: - if self._chunk_counts_for_shard[shard_name] == self._expected_chunk_totals[shard_name]: - self._kick_off_next_shard_for_group(group_id) - elif count > self._expected_chunk_totals[shard_name]: - logger.error(f"Received more chunks than expected for {shard_name}") - error = ValueError(f"Received more chunks than expected for {shard_name}") - self.other_failed(ser_exc_info(error)) - raise error - - def _update_metrics_for_new_chunks(self, chunks): + self.shard_status[shard_name].current_buffer.extend(chunks) + + # if we have buffered chunks, we need to check if we can send them to the broker + self._attempt_to_flush_buffers() + self._metrics.chunks_finished += len(chunks) # update metrics for chunk in chunks: self._metrics.rows_finished += chunk.num_rows - for field, field_count in chunk.field_counts.items(): - self._metrics.field_counts[field] = self._metrics.field_counts.get(field_count, 0) + field_count + for field, count in chunk.field_counts.items(): + self._metrics.field_counts[field] = self._metrics.field_counts.get(field, 0) + count + if len(chunks) > 0: ray.get(self.broker_ref._new_metrics.remote(self._metrics)) def shard_finished(self, shard_name: str, expected_num_chunks: 
int): """Callback method for when a shard worker has finished.""" - with log_failures_to(self.broker_ref): - logger.info(f"Shard {shard_name} finished") - group_id = self._shard_name_to_group[shard_name] - self._expected_chunk_totals[shard_name] = expected_num_chunks - - group_total = self._group_total_if_known(group_id) - if group_total is not None: - self._current_round_robin.group_total_known(group_id, group_total) - - self._attempt_to_flush_buffers() - self._metrics.shards_finished += 1 - ray.get(self.broker_ref._new_metrics.remote(self._metrics)) - - if self._chunk_counts_for_shard[shard_name] == self._expected_chunk_totals[shard_name]: - self._kick_off_next_shard_for_group(group_id) - - # if there are no more active shards, we're done - if self._current_round_robin.is_finished(): - self._finish() + shard_status = self.shard_status[shard_name] + assert ( + shard_status.expected_num_chunks is None + ), f"Shard {shard_name} already finished: {shard_status.expected_num_chunks} {expected_num_chunks}" + shard_status.expected_num_chunks = expected_num_chunks + + # we might still have buffered chunks, so we need to check if we can append them + self._attempt_to_flush_buffers() + self._metrics.shards_finished += 1 + ray.get(self.broker_ref._new_metrics.remote(self._metrics)) - def _group_total_if_known(self, group_id): - total = 0 - if self._active_shard_for_group[group_id] != len(self._shard_groups[group_id]) - 1: - return None - for shard_name in self._shard_groups[group_id]: - if shard_name not in self._expected_chunk_totals: - return None - total += self._expected_chunk_totals[shard_name] + # if there are no more active shards, we're done + if self._all_shards_done(): + assert len(self._current_round_robin) == 0 + self._finish() - return total + def _all_shards_done(self): + return all(status.is_finished_and_buffer_empty for status in self.shard_status.values()) def shard_failed(self, shard_name: str, error: ExceptionInfo): """Callback method for when a shard worker has failed.""" @@ -1009,25 +948,60 @@ def _attempt_to_flush_buffers(self): # If we can, we send them to the broker # here "finished" means that the shard has sent all of its chunks and has told us that it's done. - chunks_to_send = self._current_round_robin.drain() + + chunks_to_send = [] + + while len(self._current_round_robin) > 0: + name = self._current_round_robin[0] + status = self.shard_status[name] + if status.is_finished_and_buffer_empty: + # we're done with this shard, so we can remove it from the roundrobin + self._current_round_robin.pop(0) + logger.debug(f"Shard {name} is finished, removing from round robin") + continue + + # now let's see if we can send a chunk from this shard + next_chunk = status.pop_chunk_to_send() + if next_chunk is not None: + # we can send a chunk from this shard + self._current_round_robin.pop(0) + self._current_round_robin.append(name) + chunks_to_send.append(next_chunk) + continue + else: + # we can't send a chunk from this shard, so we can't send any additional chunks + if self.logger.level <= pylogging.DEBUG: + chunks_waiting = [ + f"{n2} ({len(s2.current_buffer)})" + for n2, s2 in self.shard_status.items() + if len(s2.current_buffer) > 0 + ] + msg = ( + f"Shard {name} has no chunks to send and is not known to be finished. 
We have this many queued" + f" chunks: {chunks_waiting}" + ) + self.logger.debug(msg) + break + if len(chunks_to_send) > 0: + logger.debug(f"Sending {len(chunks_to_send)} chunks to broker") ray.get(self.broker_ref._append_chunks.remote(*chunks_to_send)) def _finish(self): self._metrics.is_finished = True ray.get(self.broker_ref._new_metrics.remote(self._metrics)) ray.get(self.broker_ref._finalize.remote()) - self._shard_writers = [] - self._shard_readers = [] + # self._shard_writers = [] + # self._shard_readers = [] @ray.remote(num_cpus=0) class ChunkCacheBroker(SnitchRecipient): """Actor that manages the global order on chunks and vends chunk metadata to readers.""" - _chunks: InProgressSequence[ChunkMetadata] - _latest_metrics: InProgressCacheMetrics - _metrics_condition: asyncio.Condition + chunks: List[ChunkMetadata] + _reader_promises: Dict[int, asyncio.Future[ChunkMetadata]] + _finished_promise: asyncio.Future[None] def __init__( self, @@ -1036,21 +1010,19 @@ def __init__( processor: BatchProcessor[T], rows_per_chunk: int, cache_config: Optional[Dict[str, Any]], - randomize_shards: bool, - shards_to_read_at_once: int, ): pylogging.basicConfig(level=pylogging.INFO, format=LOG_FORMAT) - - self._chunks = InProgressSequence() + self.chunks = [] + self._reader_promises = {} self._is_finished = False self._source = source self._processor = processor self._cache_dir = cache_dir + self._rows_per_chunk = rows_per_chunk + self._finished_promise = asyncio.Future() # used to subscribe to metrics updates self._latest_metrics = InProgressCacheMetrics() self._metrics_condition = asyncio.Condition() - self._finished_sentinel: asyncio.Future[None] = asyncio.Future() - self._cache_config = cache_config path_for_name = os.path.join(*self._cache_dir.split("/")[-2:]) name = f"broker::{path_for_name}" @@ -1060,25 +1032,32 @@ def __init__( # first see if we need to do anything: check the ledger for is_finished try: cache_ledger = _load_cache_ledger(self._cache_dir) + self._append_chunks(*cache_ledger.chunks) + self._is_finished = True + self._finished_promise.set_result(None) except FileNotFoundError: self_ref = ray.runtime_context.get_runtime_context().current_actor # only use the last two components of the name since it gets kind of long name = f"builder::{path_for_name}" - self._builder_actor = ChunkCacheBuilder.remote(self_ref, self._cache_dir, name, self._source, self._processor, rows_per_chunk, randomize_shards=randomize_shards, shards_to_read_at_once=shards_to_read_at_once) # type: ignore - else: - self._append_chunks(*cache_ledger.chunks) - self._finalize() + self._builder_actor = ChunkCacheBuilder.remote( # type: ignore + self_ref, + self._cache_dir, + name, + self._source, + self._processor, + rows_per_chunk, + ) # type: ignore def is_finished(self): - return self._chunks.is_finished() + return self._is_finished async def finished_sentinel(self): - await self._finished_sentinel + await self._finished_promise async def updated_metrics(self) -> InProgressCacheMetrics: - if self.is_finished(): - if self._chunks.finished_promise.exception() is not None: - raise self._chunks.finished_promise.exception() # type: ignore + if self._finished_promise.done(): + if self._finished_promise.exception() is not None: + raise self._finished_promise.exception() # type: ignore else: return self._latest_metrics @@ -1087,20 +1066,33 @@ async def updated_metrics(self) -> InProgressCacheMetrics: return self._latest_metrics async def get_chunk(self, chunk_idx: int) -> Optional[ChunkMetadata]: - try: - return await 
self._chunks.get(chunk_idx) - except IndexError: + assert isinstance(self.chunks, list), self.chunks + if chunk_idx < len(self.chunks): + return self.chunks[chunk_idx] + elif self._is_finished: return None + elif self._finished_promise.exception() is not None: + raise self._finished_promise.exception() # type: ignore + else: + if chunk_idx not in self._reader_promises: + self._reader_promises[chunk_idx] = asyncio.Future() + return await self._reader_promises[chunk_idx] async def final_chunk_count(self) -> Optional[int]: - await self._chunks.finished_promise - return self._chunks.final_length() + if self._is_finished: + return len(self.chunks) + else: + return None def _append_chunks(self, *chunks: ChunkMetadata): for chunk in chunks: - self._chunks.append(chunk) - chunk_idx = self._chunks.current_length() + self.chunks.append(chunk) + chunk_idx = len(self.chunks) - 1 self.logger.debug(f"Received chunk {chunk_idx}") + if chunk_idx in self._reader_promises: + self.logger.debug(f"Resolving promise for chunk {chunk_idx}") + self._reader_promises[chunk_idx].set_result(chunk) + del self._reader_promises[chunk_idx] def _new_metrics(self, metrics): self._latest_metrics = metrics @@ -1122,24 +1114,36 @@ def _child_failed(self, child: ray.actor.ActorHandle, exception: ExceptionInfo): def _writer_exception(self, shard_name, exc_info: ExceptionInfo): info = exc_info.restore() + logger.exception(f"Writer task {shard_name} failed with exception", exc_info=info) - self._finished_sentinel.set_exception(info[1]) - self._chunks.set_exception(info[1]) + for future in self._reader_promises.values(): + future.set_exception(info[1]) + + self._reader_promises = {} + + self._finished_promise.set_exception(info[1]) self._do_notify() def _finalize(self): logger.info(f"Finalizing cache {self._cache_dir}...") self._is_finished = True - self._chunks.finalize() - assert self._chunks.is_finished() - self._finished_sentinel.set_result(None) - self._do_notify() + for k, future in self._reader_promises.items(): + future.set_result(None) # write ledger _serialize_json_and_commit( - os.path.join(self._cache_dir, LEDGER_FILE_NAME), CacheLedger(self._chunks.to_list(), self._cache_config) + os.path.join(self._cache_dir, LEDGER_FILE_NAME), CacheLedger(self.chunks, self._cache_config) ) + self._reader_promises = {} + # TODO: For some reason this crashes other actors with weird reference counting assertion errors. 
+ # pretty sure it's a ray bug + # self._builder_actor = None + self._finished_promise.set_result(None) + + # notify metrics subscribers + self._do_notify() + def _get_broker_actor( cache_dir, @@ -1147,19 +1151,16 @@ def _get_broker_actor( processor, cache_config=None, rows_per_chunk=DEFAULT_ROWS_PER_CHUNK, - *, - randomize_shards, - shards_to_read_at_once, ): - return ChunkCacheBroker.options(name="lev_cache_manager::" + cache_dir, get_if_exists=True).remote( + return ChunkCacheBroker.options( + name="lev_cache_manager::" + cache_dir.replace("/", "--"), get_if_exists=True, lifetime="detached" + ).remote( # type: ignore cache_dir=cache_dir, source=input_shards, processor=processor, cache_config=cache_config, rows_per_chunk=rows_per_chunk, - randomize_shards=randomize_shards, - shards_to_read_at_once=shards_to_read_at_once, ) @@ -1275,8 +1276,6 @@ def build_or_load( batch_size: int, rows_per_chunk: int, cache_config: Optional[Dict[str, Any]] = None, - randomize_shards: bool = True, - shards_to_read_at_once: int = DEFAULT_MAX_SHARDS_TO_READ_AT_ONCE, ): try: return ShardCache.load(cache_dir, batch_size) @@ -1287,8 +1286,6 @@ def build_or_load( processor=processor, cache_config=cache_config, rows_per_chunk=rows_per_chunk, - randomize_shards=randomize_shards, - shards_to_read_at_once=shards_to_read_at_once, ) return ShardCache(cache_dir=cache_dir, batch_size=batch_size, ledger=None, _broker=broker) @@ -1469,7 +1466,7 @@ def attach_metrics_monitor(self, monitor: MetricsMonitor): self._metrics_monitors.append(monitor) if self._monitor_thread is None: - self._monitor_thread = threading.Thread(target=self._monitor_metrics, daemon=True) + self._monitor_thread = threading.Thread(target=self._monitor_metrics) self._monitor_thread.start() def _monitor_metrics(self): diff --git a/src/levanter/data/sharded_dataset.py b/src/levanter/data/sharded_dataset.py index a63ef979b..e16f5fce7 100644 --- a/src/levanter/data/sharded_dataset.py +++ b/src/levanter/data/sharded_dataset.py @@ -69,7 +69,6 @@ def build_or_load_cache( rows_per_chunk: Optional[int] = None, await_finished: bool = True, monitors: Optional[Sequence["MetricsMonitor"]] = None, - randomize_shards: bool = False, ) -> ShardableDataset[dict]: """ Constructs a shard cache version of this dataset using Ray. 
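For orientation, the call pattern once `randomize_shards` is dropped mirrors the test updates below; this is only a minimal sketch, where `SimpleShardSource` and `TestProcessor` are the test-suite helpers standing in for any shard source and batch processor, and the import path is assumed:

```python
import tempfile

from levanter.data.shard_cache import build_or_load_cache  # import path assumed

# Build (or load, if a ledger already exists) a shard cache; shard order is no longer randomized.
with tempfile.TemporaryDirectory() as tmpdir:
    cache = build_or_load_cache(
        tmpdir,               # cache_dir: local path or gs:// URL
        SimpleShardSource(),  # any shard source yielding raw rows (test helper here)
        TestProcessor(),      # any BatchProcessor applied batch-by-batch (test helper here)
        await_finished=True,  # block until every shard has been processed and written
    )
```
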
@@ -101,7 +100,6 @@ def build_or_load_cache( rows_per_chunk=rows_per_chunk, await_finished=await_finished, monitors=monitors, - randomize_shards=randomize_shards, ) return DictCacheDataset(cache) @@ -208,7 +206,10 @@ def shard_names(self) -> Sequence[str]: def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[str]: url = self._shard_name_to_url_mapping[shard_name] i = 0 - with fsspec.open(url, "r", compression="infer") as f: + compression = "infer" + if url.endswith(".zstd"): # hacky way to detect zstd + compression = "zstd" + with fsspec.open(url, "r", compression=compression) as f: format = _sniff_format_for_dataset(url) match format: case ".jsonl": diff --git a/src/levanter/utils/ray_utils.py b/src/levanter/utils/ray_utils.py index 6a57d77cb..255968815 100644 --- a/src/levanter/utils/ray_utils.py +++ b/src/levanter/utils/ray_utils.py @@ -91,4 +91,3 @@ def log_failures_to(parent): yield except Exception as e: parent._child_failed.remote(current_actor_handle(), ser_exc_info(e)) - raise e diff --git a/tests/test_shard_cache.py b/tests/test_shard_cache.py index 6634a3301..7500307db 100644 --- a/tests/test_shard_cache.py +++ b/tests/test_shard_cache.py @@ -73,9 +73,8 @@ def test_cache_simple(shards_to_read_at_once): tmpdir, SimpleShardSource(), TestProcessor(), - randomize_shards=False, await_finished=True, - shards_to_read_at_once=shards_to_read_at_once, + # shards_to_read_at_once=shards_to_read_at_once, ) simple_processed = simple_process(TestProcessor(), SimpleShardSource()) @@ -135,33 +134,23 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[List[int]]: with tempfile.TemporaryDirectory() as tmpdir, tempfile.TemporaryDirectory() as tmpdir2: source = CrashingShardSource(4) with pytest.raises(_CustomException): - build_or_load_cache(tmpdir, source, TestProcessor(), randomize_shards=True) + build_or_load_cache(tmpdir, source, TestProcessor()) # kill the broker actor so that we can test recovery - ray.kill( - _get_broker_actor(tmpdir, source, TestProcessor(), randomize_shards=True, shards_to_read_at_once=32), - no_restart=True, - ) + ray.kill(_get_broker_actor(tmpdir, source, TestProcessor()), no_restart=True) source = CrashingShardSource(5) with pytest.raises(_CustomException): - build_or_load_cache(tmpdir, source, TestProcessor(), randomize_shards=True) + build_or_load_cache(tmpdir, source, TestProcessor()) - ray.kill( - _get_broker_actor(tmpdir, source, TestProcessor(), randomize_shards=True, shards_to_read_at_once=32), - no_restart=True, - ) + ray.kill(_get_broker_actor(tmpdir, source, TestProcessor()), no_restart=True) # testing this doesn't throw source = CrashingShardSource(1000) - reader1 = build_or_load_cache( - tmpdir, source, TestProcessor(), batch_size=1, await_finished=True, randomize_shards=True - ) + reader1 = build_or_load_cache(tmpdir, source, TestProcessor(), batch_size=1, await_finished=True) # compare to the original with no crash - reader2 = build_or_load_cache( - tmpdir2, SimpleShardSource(), TestProcessor(), batch_size=1, await_finished=True, randomize_shards=True - ) + reader2 = build_or_load_cache(tmpdir2, SimpleShardSource(), TestProcessor(), batch_size=1, await_finished=True) assert list(reader1) == list(reader2) assert len(list(reader1)) == 40 @@ -203,7 +192,6 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[List[int]]: batch_size=1, rows_per_chunk=10, await_finished=False, - randomize_shards=False, ) # now block until the cache is done @@ -309,7 +297,7 @@ def test_map_batches_and_map_shard_cache(): .map(lambda 
list: list * 2) .map_batches(TestProcessor(), 8) .map(lambda d: {"q": d["test"]}) - .build_or_load_cache(tmpdir, await_finished=True, randomize_shards=False) + .build_or_load_cache(tmpdir, await_finished=True) ) def composite_fn(list): diff --git a/tests/test_tokenized_document_cache.py b/tests/test_tokenized_document_cache.py index 9c8e83db4..d3b452937 100644 --- a/tests/test_tokenized_document_cache.py +++ b/tests/test_tokenized_document_cache.py @@ -89,9 +89,7 @@ def open_shard_at_row(self, shard_name: str, row: int): source = OneDocPerShardSource(docs) with tempfile.TemporaryDirectory() as tmpdir: - build_or_load_cache( - f"{tmpdir}/cache", source, IdentityProcessor(), await_finished=True, randomize_shards=False - ) + build_or_load_cache(f"{tmpdir}/cache", source, IdentityProcessor(), await_finished=True) cache = TokenizedDocumentCache.load(f"{tmpdir}/cache", flatten_docs=False) result = list(cache) @@ -115,7 +113,7 @@ def batch_docs(doc_ids): source = ShardsDataset([[b] for b in batches]) with tempfile.TemporaryDirectory() as tmpdir: - build_or_load_cache(f"{tmpdir}/cache", source, IdentityProcessor(), randomize_shards=False) + build_or_load_cache(f"{tmpdir}/cache", source, IdentityProcessor()) cache = TokenizedDocumentCache.load(f"{tmpdir}/cache", flatten_docs=True) result = list(cache) @@ -153,7 +151,7 @@ def doc_i(i: int): with tempfile.TemporaryDirectory() as tmpdir: source = ShardsDataset(doc_shards) - build_or_load_cache(f"{tmpdir}/cache", source, IdentityProcessor(), randomize_shards=False) + build_or_load_cache(f"{tmpdir}/cache", source, IdentityProcessor()) # must evenly divide num_shards num_shards_rebuild = [1, 2, 3, 4, 6, 12] From c17f6530b0d0b698ff993e9aa8f8e59e336de561 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 25 Jul 2024 20:36:27 -0700 Subject: [PATCH 11/94] log run_progress for a special x axis. 
Fixes #671 (#674) --- src/levanter/callbacks.py | 12 +++++++++--- src/levanter/trainer.py | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/levanter/callbacks.py b/src/levanter/callbacks.py index 3566954ed..406a7b39a 100644 --- a/src/levanter/callbacks.py +++ b/src/levanter/callbacks.py @@ -91,9 +91,15 @@ def compute_loss(info: StepInfo): return compute_loss -def log_step_info(step: StepInfo): - levanter.tracker.log_metrics({"train/loss": step.loss, "global_step": step.step}, step=step.step) - log_optimizer_hyperparams(step.opt_state, step=step.step, prefix="optim") +def log_step_info(total_steps: Optional[int]): + def log_step_info_inner(step: StepInfo): + metrics = {"train/loss": step.loss, "global_step": step.step} + if total_steps: + metrics["run_progress"] = step.step / total_steps + log_optimizer_hyperparams(step.opt_state, step=step.step, prefix="optim") + levanter.tracker.log_metrics(metrics, step=step.step) + + return log_step_info_inner def wandb_xla_logger(config: WandbConfig): diff --git a/src/levanter/trainer.py b/src/levanter/trainer.py index 1aaf8145d..14c1e6f96 100644 --- a/src/levanter/trainer.py +++ b/src/levanter/trainer.py @@ -413,7 +413,7 @@ def _add_default_hooks(self): from levanter import callbacks self.add_hook(callbacks.pbar_logger(total=self.config.num_train_steps), every=1) - self.add_hook(callbacks.log_step_info, every=1) + self.add_hook(callbacks.log_step_info(self.config.num_train_steps), every=1) # engine.add_hook(callbacks.log_memory_usage(), every=1) checkpointer = self.config.checkpointer.create(self.run_id) self.add_hook(checkpointer.on_step, every=1) # checkpointer manages its own frequency From ac0882dcda59759a58878cc99587e2e5cdb21e9c Mon Sep 17 00:00:00 2001 From: David Hall Date: Sat, 27 Jul 2024 23:15:08 -0700 Subject: [PATCH 12/94] refactor trainer to always need a loss function, add z_loss (#672) --- examples/alpaca-lora/alpaca_lora.py | 4 +- examples/alpaca/alpaca.py | 4 +- examples/gsm8k-lora/gsm8k_lora.py | 4 +- src/levanter/doremi.py | 6 +-- src/levanter/eval.py | 6 +-- src/levanter/main/doremi_lm.py | 5 ++- src/levanter/main/eval_lm.py | 4 +- src/levanter/main/lora_lm.py | 3 +- src/levanter/main/train_asr.py | 15 +++++++- src/levanter/main/train_lm.py | 8 +++- src/levanter/main/viz_logprobs.py | 4 +- src/levanter/models/lm_model.py | 59 +++++++++++++++++------------ src/levanter/models/loss.py | 30 ++++++++++++--- src/levanter/trainer.py | 12 ++---- src/levanter/types.py | 16 -------- 15 files changed, 102 insertions(+), 78 deletions(-) diff --git a/examples/alpaca-lora/alpaca_lora.py b/examples/alpaca-lora/alpaca_lora.py index 0e5754910..de6e1f059 100644 --- a/examples/alpaca-lora/alpaca_lora.py +++ b/examples/alpaca-lora/alpaca_lora.py @@ -21,7 +21,7 @@ save_merged_hf_checkpoint_callback, save_peft_checkpoint_callback, ) -from levanter.models.lm_model import LmHeadModel +from levanter.models.lm_model import LmHeadModel, compute_next_token_loss from levanter.trainer import Trainer from levanter.utils.jax_utils import parameter_count from levanter.utils.py_utils import non_caching_cycle @@ -82,7 +82,7 @@ def train(config: TrainArgs): # end major difference from Alpaca - with Trainer(config.trainer, optimizer) as trainer: + with Trainer(config.trainer, optimizer, loss_fn=compute_next_token_loss) as trainer: # type: ignore # how we shard parameters across devices parameter_axis_mapping = config.trainer.parameter_axis_mapping diff --git a/examples/alpaca/alpaca.py b/examples/alpaca/alpaca.py index 164759912..0ecf78e6e 
100644 --- a/examples/alpaca/alpaca.py +++ b/examples/alpaca/alpaca.py @@ -17,7 +17,7 @@ from levanter.compat.hf_checkpoints import HFCheckpointConverter, save_hf_checkpoint_callback from levanter.data import Dataset from levanter.data.sharded_dataset import JsonDataset, JsonlDataset, WrappedHFDataset -from levanter.models.lm_model import LmExample, LmHeadModel +from levanter.models.lm_model import LmExample, LmHeadModel, compute_next_token_loss from levanter.optim import OptimizerConfig from levanter.trainer import Trainer, TrainerConfig from levanter.utils import fsspec_utils @@ -227,7 +227,7 @@ def train(config: TrainArgs): optimizer = config.optimizer.build(config.trainer.num_train_steps) - with Trainer(config.trainer, optimizer) as trainer: + with Trainer(config.trainer, optimizer, loss_fn=compute_next_token_loss) as trainer: # type: ignore # how we shard parameters across devices parameter_axis_mapping = trainer.parameter_axis_mapping diff --git a/examples/gsm8k-lora/gsm8k_lora.py b/examples/gsm8k-lora/gsm8k_lora.py index 7364d6775..0823686e1 100644 --- a/examples/gsm8k-lora/gsm8k_lora.py +++ b/examples/gsm8k-lora/gsm8k_lora.py @@ -25,7 +25,7 @@ save_peft_checkpoint_callback, ) from levanter.models.llama import LlamaConfig -from levanter.models.lm_model import LmConfig, LmExample, LmHeadModel +from levanter.models.lm_model import LmConfig, LmExample, LmHeadModel, compute_next_token_loss from levanter.optim import OptimizerConfig from levanter.trainer import Trainer, TrainerConfig from levanter.utils.hf_utils import num_cpus_used_by_tokenizer @@ -155,7 +155,7 @@ def train(config: TrainArgs): optimizer = config.optimizer.build(config.trainer.num_train_steps) - with Trainer(config.trainer, optimizer) as trainer: + with Trainer(config.trainer, optimizer, loss_fn=compute_next_token_loss) as trainer: # type: ignore # how we shard parameters across devices parameter_axis_mapping = config.trainer.parameter_axis_mapping diff --git a/src/levanter/doremi.py b/src/levanter/doremi.py index f4b88bd5e..63495d709 100644 --- a/src/levanter/doremi.py +++ b/src/levanter/doremi.py @@ -18,7 +18,7 @@ from levanter.data.mixture import MixtureDataset from levanter.tracker import capture_time from levanter.trainer import M, StepInfo, Trainer, TrainerConfig, TrainerState -from levanter.types import ComputeLossFunction, ModuleComputeLoss +from levanter.types import ComputeLossFunction from levanter.utils.tree_utils import inference_mode @@ -53,6 +53,7 @@ class DoReMiConfig: def estimate_mixture_weights( + loss_fn: ComputeLossFunction[M, T], initial_proxy: M, ref: M, data_sources: dict[str, ShardableDataset[T]], @@ -61,7 +62,6 @@ def estimate_mixture_weights( validation_sets: Optional[dict[str, ShardableDataset[T]]] = None, trainer_config: TrainerConfig = DEFAULT_DOREMI_TRAINER_CONFIG, optimizer: optax.GradientTransformation = optax.adamw(1e-3), - loss_fn: ComputeLossFunction[M, T] = ModuleComputeLoss(), domain_weight_step_size: float = 1.0, smoothing: float = 1e-3, key: PRNGKeyArray, @@ -92,7 +92,7 @@ def estimate_mixture_weights( Domain = hax.Axis("domain", len(domain_indices)) initial_alpha = hax.ones(Domain) / Domain.size - trainer = Trainer(trainer_config, optimizer) + trainer = Trainer(trainer_config, optimizer, loss_fn) with trainer: ref = _prepare_ref_model(ref, trainer_config) diff --git a/src/levanter/eval.py b/src/levanter/eval.py index 557ce2a43..6a016f1f9 100644 --- a/src/levanter/eval.py +++ b/src/levanter/eval.py @@ -15,7 +15,7 @@ import levanter.tracker from levanter.data import Dataset, 
ReplicatedBatchLoader from levanter.logging import LoadingTimeTrackerIterator -from levanter.models.lm_model import LmExample, LmHeadModel +from levanter.models.lm_model import LmExample, LmHeadModel, compute_next_token_loss from levanter.trainer import StepInfo from levanter.utils.stat_utils import RunningMean from levanter.utils.tree_utils import inference_mode @@ -204,9 +204,9 @@ def accum_for_batch( m = self.mp.cast_to_compute(m) with hax.axis_mapping(axis_mapping): total_mean, mean_per_tag = state - losses = m.compute_loss(batch, reduction=None, reduction_axis=()) + losses = compute_next_token_loss(m, batch, reduction=None, reduction_axis=()) mask = batch.loss_mask # [Batch, Token] - this_tokens = hax.einsum("->", mask) + this_tokens = hax.sum(mask) this_loss = hax.einsum("->", losses, mask) # to scalar this_tokens_per_tag = hax.einsum("-> tag", mask, tags) diff --git a/src/levanter/main/doremi_lm.py b/src/levanter/main/doremi_lm.py index b15b9b8d3..42d84d54d 100644 --- a/src/levanter/main/doremi_lm.py +++ b/src/levanter/main/doremi_lm.py @@ -13,7 +13,7 @@ from levanter.data.text import CausalLmDataset, LMMixtureDatasetConfig from levanter.doremi import DoReMiConfig, estimate_mixture_weights from levanter.models.gpt2 import Gpt2Config -from levanter.models.lm_model import LmConfig, LmHeadModel +from levanter.models.lm_model import LmConfig, LmHeadModel, compute_next_token_loss from levanter.optim import AdamConfig, OptimizerConfig from levanter.trainer import TrainerConfig from levanter.utils.tree_utils import inference_mode @@ -77,6 +77,8 @@ def main(config: TrainLmConfig): parameter_axis_mapping = config.trainer.parameter_axis_mapping + loss_function = compute_next_token_loss + with config.trainer.device_mesh: vocab_size = len(tokenizer) Vocab = round_axis_for_partitioning(Axis("vocab", vocab_size), parameter_axis_mapping) @@ -119,6 +121,7 @@ def init_proxy_model(): } mixture_weights = estimate_mixture_weights( + loss_function, proxy_model, ref=ref_model, data_sources=train_datasets, diff --git a/src/levanter/main/eval_lm.py b/src/levanter/main/eval_lm.py index 09148c7e1..6d92c717a 100644 --- a/src/levanter/main/eval_lm.py +++ b/src/levanter/main/eval_lm.py @@ -19,7 +19,7 @@ from levanter.data import ReplicatedBatchLoader from levanter.data.text import CausalLmDataset, LMDatasetConfig from levanter.models.gpt2 import Gpt2Config -from levanter.models.lm_model import LmConfig, LmExample, LmHeadModel +from levanter.models.lm_model import LmConfig, LmExample, LmHeadModel, compute_next_token_loss from levanter.trainer import TrainerConfig from levanter.utils.jax_utils import use_cpu_device from levanter.utils.tree_utils import inference_mode @@ -75,7 +75,7 @@ def main(config: EvalLmConfig): def compute_loss(model: LmHeadModel, example: LmExample): model = inference_mode(model, True) model = mp.cast_to_compute(model) - return model.compute_loss(example, key=None) + return compute_next_token_loss(model, example, key=None) total = config.trainer.max_eval_batches diff --git a/src/levanter/main/lora_lm.py b/src/levanter/main/lora_lm.py index 8ce5522d1..d3526a97f 100644 --- a/src/levanter/main/lora_lm.py +++ b/src/levanter/main/lora_lm.py @@ -18,6 +18,7 @@ save_merged_hf_checkpoint_callback, save_peft_checkpoint_callback, ) +from levanter.models.lm_model import compute_next_token_loss from levanter.optim import AdamConfig, OptimizerConfig from levanter.trainer import Trainer, TrainerConfig from levanter.utils.jax_utils import parameter_count @@ -75,7 +76,7 @@ def main(config: LoraLmConfig): 
optimizer = config.optimizer.build(config.trainer.num_train_steps) - with Trainer(config.trainer, optimizer) as trainer: + with Trainer(config.trainer, optimizer, loss_fn=compute_next_token_loss) as trainer: # type: ignore # how we shard parameters across devices parameter_axis_mapping = config.trainer.parameter_axis_mapping diff --git a/src/levanter/main/train_asr.py b/src/levanter/main/train_asr.py index 86b5fea8c..82d8dd601 100644 --- a/src/levanter/main/train_asr.py +++ b/src/levanter/main/train_asr.py @@ -4,6 +4,7 @@ from dataclasses import dataclass, field from typing import Optional, Union +import jax import jax.random as jrandom import haliax as hax @@ -14,7 +15,7 @@ from levanter import callbacks from levanter.compat.hf_checkpoints import HFCompatConfig, ModelWithHfSerializationMixin, save_hf_checkpoint_callback from levanter.data.audio import AudioIODatasetConfig, AudioTextDataset -from levanter.models.asr_model import ASRConfig +from levanter.models.asr_model import ASRConfig, AudioTextExample from levanter.models.whisper import WhisperConfig from levanter.optim import AdamConfig, OptimizerConfig from levanter.trainer import Trainer, TrainerConfig @@ -81,11 +82,21 @@ def main(config: TrainASRConfig): levanter.initialize(config) optimizer = config.optimizer.build(config.trainer.num_train_steps) + def compute_loss( + m, + example: AudioTextExample, + *, + key=None, + reduction: Optional[hax.ReductionFunction] = hax.mean, + reduction_axis: Optional[hax.AxisSelection] = None, + ) -> jax.numpy.ndarray | hax.NamedArray: + return m.compute_loss(example, key=key, reduction=reduction, reduction_axis=reduction_axis) + # Using the trainer as a context manager does 3 things: # 1. Sets the device mesh # 2. Sets the axis mapping (for fsdp) # 3. Sets the global metrics tracker - with Trainer(config.trainer, optimizer) as trainer: + with Trainer(config.trainer, optimizer, compute_loss) as trainer: # type: ignore # randomness in jax is tightly controlled by "keys" which are the states of the random number generators # this makes deterministic training pretty easy seed = config.trainer.seed diff --git a/src/levanter/main/train_lm.py b/src/levanter/main/train_lm.py index 00099c86f..385b6fc2b 100644 --- a/src/levanter/main/train_lm.py +++ b/src/levanter/main/train_lm.py @@ -1,4 +1,5 @@ import dataclasses +import functools import gc import logging import os @@ -16,7 +17,7 @@ from levanter.compat.hf_checkpoints import HFCompatConfig, save_hf_checkpoint_callback from levanter.data.text import CausalLmDataset, LMDatasetConfig, LMMixtureDatasetConfig from levanter.models.gpt2 import Gpt2Config -from levanter.models.lm_model import LmConfig +from levanter.models.lm_model import LmConfig, compute_next_token_loss from levanter.optim import AdamConfig, OptimizerConfig from levanter.trainer import Trainer, TrainerConfig from levanter.utils.jax_utils import parameter_count @@ -41,6 +42,7 @@ class TrainLmConfig: # TODO: atm you have to at least specify a levanter model config with the same type as the hf checkpoint fcm_prob: float = 0.0 # forgetful context masking prob. recommended 0.15 + z_loss_weight: float = 0.0 hf_save_path: Optional[str] = None hf_upload: Optional[str] = None @@ -82,11 +84,13 @@ def main(config: TrainLmConfig): levanter.initialize(config) optimizer = config.optimizer.build(config.trainer.num_train_steps) + loss_function = functools.partial(compute_next_token_loss, logsumexp_weight=config.z_loss_weight) + # Using the trainer as a context manager does 3 things: # 1. 
Sets the device mesh # 2. Sets the axis mapping (for fsdp) # 3. Sets the global metrics tracker - with Trainer(config.trainer, optimizer) as trainer: + with Trainer(config.trainer, optimizer, loss_function) as trainer: # randomness in jax is tightly controlled by "keys" which are the states of the random number generators # this makes deterministic training pretty easy seed = config.trainer.seed diff --git a/src/levanter/main/viz_logprobs.py b/src/levanter/main/viz_logprobs.py index bf8b603b2..a95783c18 100644 --- a/src/levanter/main/viz_logprobs.py +++ b/src/levanter/main/viz_logprobs.py @@ -14,7 +14,7 @@ from levanter.data import ReplicatedBatchLoader from levanter.data.text import CausalLmDataset, LMDatasetConfig from levanter.models.gpt2 import Gpt2Config -from levanter.models.lm_model import LmConfig, LmExample, LmHeadModel +from levanter.models.lm_model import LmConfig, LmExample, LmHeadModel, compute_next_token_loss from levanter.trainer import TrainerConfig from levanter.utils.jax_utils import use_cpu_device from levanter.utils.tree_utils import inference_mode @@ -72,7 +72,7 @@ def main(config: VizGpt2Config): def compute_log_probs(model: LmHeadModel, example: LmExample): model = inference_mode(model, True) model = mp.cast_to_compute(model) - logprobs = model.compute_loss(example, reduction=None) + logprobs = compute_next_token_loss(model, example, reduction=None) # roll forward to get the loss for each predicted token logprobs = hax.roll(logprobs, 1, Pos) return logprobs.rearrange((EvalBatch, Pos)).array diff --git a/src/levanter/models/lm_model.py b/src/levanter/models/lm_model.py index 543c6a5ca..468f6a4a4 100644 --- a/src/levanter/models/lm_model.py +++ b/src/levanter/models/lm_model.py @@ -8,9 +8,9 @@ import haliax as hax from haliax import Axis, NamedArray -from haliax.nn import cross_entropy_loss from levanter.models.attention import AttentionMask +from levanter.models.loss import next_token_loss LmConfigT = TypeVar("LmConfigT", bound="LmConfig") @@ -113,30 +113,39 @@ def resize_vocab(self, new_size: int, key: Optional[PRNGKey] = None) -> "LmHeadM """ pass - def compute_loss( - self, - example: LmExample, - *, - key=None, - reduction: Optional[hax.ReductionFunction] = hax.mean, - reduction_axis: Optional[hax.AxisSelection] = None, - ) -> jnp.ndarray | NamedArray: - """ - Computes the cross-entropy loss for a language modeling example. If reduction is not None, the loss is reduced - across the reduction axis (with reduction_axis=None meaning all axes). If reduction is None, the loss is not - reduced, and the result is a named array with axes (*batch axes, sequence_length). - """ - logits = self(example.tokens, example.attn_mask, key=key) - # TODO: would be nice if we made the dtype configurable - logits = logits.astype(jnp.float32) - targets = hax.roll(example.tokens, -1, axis=self.Pos.name) - target_y = hax.nn.one_hot(targets, self.Vocab, dtype=logits.dtype) - loss = cross_entropy_loss( - logits, self.Vocab, target_y, reduction, reduction_axis=reduction_axis, where=example.loss_mask - ) - - return loss - @property def vocab_size(self) -> int: return self.Vocab.size + + +def compute_next_token_loss( + model: LmHeadModel, + example: LmExample, + *, + key=None, + reduction: Optional[hax.ReductionFunction] = hax.mean, + reduction_axis: Optional[hax.AxisSelection] = None, + logsumexp_weight: Optional[float] = None, + loss_dtype: Optional[Type[jnp.dtype]] = jnp.float32, +) -> jnp.ndarray | NamedArray: + """ + Computes the cross-entropy loss for a language modeling example. 
If reduction is not None, the loss is reduced + across the reduction axis (with reduction_axis=None meaning all axes). If reduction is None, the loss is not + reduced, and the result is a named array with axes (*batch axes, sequence_length). + """ + logits = model(example.tokens, example.attn_mask, key=key) + if loss_dtype is not None: + logits = logits.astype(loss_dtype) + + loss = next_token_loss( + model.Pos, + model.Vocab, + logits, + example.tokens, + loss_mask=example.loss_mask, + reduction=reduction, + reduction_axis=reduction_axis, + logsumexp_weight=logsumexp_weight, + ) + + return loss diff --git a/src/levanter/models/loss.py b/src/levanter/models/loss.py index 65d1441ee..1ef7e81f9 100644 --- a/src/levanter/models/loss.py +++ b/src/levanter/models/loss.py @@ -4,7 +4,7 @@ import haliax as hax from haliax import NamedArray -from haliax.nn import cross_entropy_loss, cross_entropy_loss_and_log_normalizers +from haliax.nn import cross_entropy_loss_and_log_normalizers def next_token_loss( @@ -14,6 +14,8 @@ def next_token_loss( true_ids: NamedArray, loss_mask: Optional[NamedArray] = None, reduction: Optional[hax.ReductionFunction] = hax.mean, + reduction_axis: Optional[hax.AxisSelection] = None, + logsumexp_weight: Optional[float] = None, ): Pos, Vocab = pred_ids.resolve_axis((Pos, Vocab)) # need to roll the target tokens back by one so that each token is predicting the next token @@ -27,16 +29,32 @@ def next_token_loss( else: loss_mask = not_last_loss_mask - return cross_entropy_loss(pred_ids, Vocab, target_y, reduction=reduction, where=loss_mask, reduction_axis=Pos) + return cross_entropy_and_logsumexp_penalty( + pred_ids, + Vocab, + target_y, + reduction=reduction, + reduction_axis=reduction_axis, + where=loss_mask, + logsumexp_weight=logsumexp_weight, + ) def cross_entropy_and_logsumexp_penalty( - pred_y: NamedArray, Vocab: hax.Axis, target_y: NamedArray, logsumexp_weight=0.0 + pred_y: NamedArray, + Vocab: hax.Axis, + target_y: NamedArray, + *, + reduction: Optional[hax.ReductionFunction] = hax.mean, + reduction_axis: Optional[hax.AxisSelection] = None, + where: Optional[NamedArray] = None, + logsumexp_weight=0.0, ) -> NamedArray: """A loss function that combines cross entropy loss with a logsumexp penalty.""" - if logsumexp_weight == 0.0: - return cross_entropy_loss(pred_y, Vocab, target_y) loss, log_normalizers = cross_entropy_loss_and_log_normalizers(pred_y, Vocab, target_y) - return loss + logsumexp_weight * (log_normalizers**2) + if logsumexp_weight is not None and logsumexp_weight != 0.0: + loss = loss + logsumexp_weight * (log_normalizers**2) + + return hax.nn.loss.maybe_reduce_loss(loss, reduction, reduction_axis, where) diff --git a/src/levanter/trainer.py b/src/levanter/trainer.py index 14c1e6f96..ef870382b 100644 --- a/src/levanter/trainer.py +++ b/src/levanter/trainer.py @@ -53,7 +53,7 @@ from levanter.grad_accum import microbatched from levanter.tracker import TrackerConfig, capture_time from levanter.trainer_state import TrainerState, saveable_training_mask -from levanter.types import ComputeLossFunction, FilterSpec, ModuleComputeLoss +from levanter.types import ComputeLossFunction, FilterSpec from levanter.utils import cloud_utils, fsspec_utils from levanter.utils.jax_utils import create_fsdp_mesh from levanter.utils.tree_utils import inference_mode @@ -144,7 +144,7 @@ def __init__( self, config: "TrainerConfig", optimizer: GradientTransformation, - loss_fn: Optional[ComputeLossFunction] = None, + loss_fn: ComputeLossFunction, *, add_default_hooks: bool = True, ): @@ 
-159,13 +159,7 @@ def __init__( self.hooks = TrainerHooks() self.config = config self.optimizer = optimizer - self._raw_loss_function = loss_fn or ModuleComputeLoss() - if isinstance(config.tracker, Sequence): - self.tracker = levanter.tracker.CompositeTracker([c.init(self.run_id) for c in config.tracker]) - else: - self.tracker = config.tracker.init(self.run_id) - - self._raw_loss_function = loss_fn or ModuleComputeLoss() + self._raw_loss_function = loss_fn if isinstance(config.tracker, Sequence): self.tracker = levanter.tracker.CompositeTracker([c.init(self.run_id) for c in config.tracker]) else: diff --git a/src/levanter/types.py b/src/levanter/types.py index d77e505c0..46ccac2b5 100644 --- a/src/levanter/types.py +++ b/src/levanter/types.py @@ -57,19 +57,3 @@ def __call__( **kwargs, ) -> Scalar | hax.NamedArray: ... - - -class ModuleComputeLoss(ComputeLossFunction[M, X]): - """ - Loss that just delegates to the model's compute_loss method. - """ - - def __call__( - self, - model, - *inputs: X, - reduction: Optional[hax.ReductionFunction] = hax.mean, - reduction_axis: Optional[hax.AxisSelection] = None, - **kwargs, - ) -> Scalar | hax.NamedArray: - return model.compute_loss(*inputs, reduction=reduction, reduction_axis=reduction_axis, **kwargs) From cb3638e7c8eff1624b534a06084b9ec5434c9912 Mon Sep 17 00:00:00 2001 From: Ivan Zhou Date: Sun, 4 Aug 2024 13:11:13 -0700 Subject: [PATCH 13/94] Specify node_count as int in launch.py (#682) --- infra/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/launch.py b/infra/launch.py index ac3b3c521..1b4569b37 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -202,7 +202,7 @@ def _default_run_id(): cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"]) cli.add_arg(parser, config, ["--tpu_name"], required=True) cli.add_arg(parser, config, ["--tpu_type"], required=True) - cli.add_arg(parser, config, ["--node_count"], default=1) + cli.add_arg(parser, config, ["--node_count"], default=1, type=int) cli.add_arg(parser, config, ["--version"], default="tpu-ubuntu2204-base") cli.add_arg(parser, config, ["--zone"], required=True) cli.add_arg(parser, config, ["--retries"], default=0, type=int) From 8111f2981f100523ef894a4345664aede1aad0c8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 11 Aug 2024 00:17:44 -0700 Subject: [PATCH 14/94] Bump ray[default] from 2.32.0 to 2.34.0 (#683) Bumps [ray[default]](https://github.com/ray-project/ray) from 2.32.0 to 2.34.0. - [Release notes](https://github.com/ray-project/ray/releases) - [Commits](https://github.com/ray-project/ray/compare/ray-2.32.0...ray-2.34.0) --- updated-dependencies: - dependency-name: ray[default] dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c1cbc8427..e6da26135 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ dependencies = [ "matplotlib>=3.7.0", "tblib>=1.7.0,<4.0.0", "dataclasses-json~=0.6.4", - "ray[default]==2.32.0", + "ray[default]==2.34.0", "pydantic<3", # temporary pin until Ray supports pydantic 2.0 "rich~=13.0", "filelock~=3.13", From 04b0904650eac58055f52bfe7088b7930725b7c9 Mon Sep 17 00:00:00 2001 From: David Hall Date: Sun, 11 Aug 2024 20:02:54 -0600 Subject: [PATCH 15/94] wandb seems to be broken in latest release (#688) * wandb seems to be broken in latest release * oops * what? --- infra/helpers/setup-tpu-vm-tests.sh | 2 +- infra/helpers/setup-tpu-vm.sh | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/infra/helpers/setup-tpu-vm-tests.sh b/infra/helpers/setup-tpu-vm-tests.sh index 4b6cf27f5..71bead17e 100755 --- a/infra/helpers/setup-tpu-vm-tests.sh +++ b/infra/helpers/setup-tpu-vm-tests.sh @@ -105,7 +105,7 @@ pip install -U wheel # jax and jaxlib # libtpu sometimes has issues installing for clinical (probably firewall?) -retry pip install -U "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html +retry pip install -U "jax[tpu]==0.4.31" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html # clone levanter git clone $REPO levanter diff --git a/infra/helpers/setup-tpu-vm.sh b/infra/helpers/setup-tpu-vm.sh index f40b4e693..f80e586bb 100755 --- a/infra/helpers/setup-tpu-vm.sh +++ b/infra/helpers/setup-tpu-vm.sh @@ -106,7 +106,7 @@ pip install -U wheel # jax and jaxlib # libtpu sometimes has issues installing for clinical (probably firewall?) 
#retry pip install -U "jax[tpu]==0.4.5" libtpu-nightly==0.1.dev20230216 -f https://storage.googleapis.com/jax-releases/libtpu_releases.html -retry pip install -U "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html +retry pip install -U "jax[tpu]==0.4.31" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html # clone levanter git clone $REPO levanter diff --git a/pyproject.toml b/pyproject.toml index e6da26135..7182b0c4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "tokenizers>=0.15.2", "transformers>=4.41.2", "optax>=0.1.9", - "wandb>=0.16.6,<0.18.0", + "wandb>=0.16.6,<0.17.6", # We don't actually directly depend on scipy, but recent JAX had an issue "scipy<=1.12.0", "draccus>=0.8.0", From 8c10a7a455efc971e73a2dd7e4ff11c709959342 Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 13 Aug 2024 17:42:29 -0700 Subject: [PATCH 16/94] switch to setup tools and forget the config thing (#691) --- pyproject.toml | 40 +++++++++++----------------------------- 1 file changed, 11 insertions(+), 29 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7182b0c4e..5cdd9718b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" [project] name = "levanter" version = "1.1" authors = [ { name="David Hall", email="dlwh@cs.stanford.edu" }, - {name="Ivan Zhou", email="ivanz@stanford.edu"} + { name="Ivan Zhou", email="ivanz@stanford.edu" } ] description = "Scalable Training for Foundation Models with Named Tensors and JAX" readme = "README.md" @@ -21,11 +21,6 @@ classifiers = [ "Intended Audience :: Science/Research", ] dependencies = [ - # we require that you install jax yourself, since the extras vary by system. - # jax = {version = ">=0.4.10,<0.5.0"} -# "haliax>=1.3,<2.0", -# Haliax changes in step with levanter, so we'll just use the git version except for releases. 
-# "haliax @ git+https://github.com/stanford-crfm/haliax.git@main", "haliax>=1.4.dev307", "equinox>=0.11.4", "jaxtyping>=0.2.20", @@ -33,7 +28,6 @@ dependencies = [ "transformers>=4.41.2", "optax>=0.1.9", "wandb>=0.16.6,<0.17.6", - # We don't actually directly depend on scipy, but recent JAX had an issue "scipy<=1.12.0", "draccus>=0.8.0", "pyarrow>=11.0.0", @@ -51,31 +45,16 @@ dependencies = [ "tblib>=1.7.0,<4.0.0", "dataclasses-json~=0.6.4", "ray[default]==2.34.0", - "pydantic<3", # temporary pin until Ray supports pydantic 2.0 + "pydantic<3", "rich~=13.0", "filelock~=3.13", +# "ai2-olmo", ] -[tool.hatch.build] -include = ["config/*.yaml", "config/*/*.yaml", "*.py"] -dev-mode-dirs = [".", "src"] - -[tool.hatch.build.sources] -"src/levanter" = "levanter" -"config" = "levanter/config" - -[tool.hatch.metadata] -allow-direct-references = true - -[tool.hatch.build.targets.wheel] -packages = ["levanter"] - - [project.urls] "Homepage" = "https://github.com/stanford-crfm/levanter" "Bug Tracker" = "https://github.com/stanford-crfm/levanter/issues" - [tool.black] line-length = 119 target-version = ["py310"] @@ -92,7 +71,7 @@ ensure_newline_before_comments = true line_length = 119 src_paths = ["src", "tests"] known_haliax = ["haliax"] -sections=["FUTURE", "STDLIB", "THIRDPARTY", "HALIAX", "FIRSTPARTY", "LOCALFOLDER"] +sections = ["FUTURE", "STDLIB", "THIRDPARTY", "HALIAX", "FIRSTPARTY", "LOCALFOLDER"] [tool.mypy] python_version = "3.10" @@ -102,7 +81,7 @@ mypy_path = ["src"] ignore_missing_imports = true [tool.pytest.ini_options] -pythonpath = [ "tests" ] +pythonpath = ["src", "tests"] markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", "entry: marks tests as entry point tests (deselect with '-m \"not entry\"')", @@ -113,9 +92,12 @@ markers = [ test = [ "pytest", "flake8", - "pytest", "soundfile", "librosa", "pytest-forked", "pytest-asyncio" ] + +[tool.setuptools.packages.find] +where = ["src"] +include = ["levanter", "levanter.*"] From e8b600318a81918168f6a5f4022f8c15aafe8547 Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 13 Aug 2024 21:00:48 -0700 Subject: [PATCH 17/94] set logging level to INFO --- src/levanter/data/shard_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/levanter/data/shard_cache.py b/src/levanter/data/shard_cache.py index f27008e45..8956412b5 100644 --- a/src/levanter/data/shard_cache.py +++ b/src/levanter/data/shard_cache.py @@ -53,7 +53,7 @@ LEDGER_FILE_NAME = "cache_ledger.json" LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -LEVEL_TO_LOG = pylogging.DEBUG +LEVEL_TO_LOG = pylogging.INFO def build_or_load_cache( From 441af5cc54c47d1db1ed0ed23d4903dc5e32a933 Mon Sep 17 00:00:00 2001 From: David Hall Date: Wed, 14 Aug 2024 16:58:02 -0700 Subject: [PATCH 18/94] update docker image, build it in ci, make the args point to the new version (#693) --- .github/workflows/docker-base-image.yaml | 40 ++++++++++++++++++++++++ docker/tpu/Dockerfile.base | 6 ++-- docker/tpu/Dockerfile.incremental | 4 ++- infra/launch.py | 2 +- 4 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/docker-base-image.yaml diff --git a/.github/workflows/docker-base-image.yaml b/.github/workflows/docker-base-image.yaml new file mode 100644 index 000000000..de7f297f4 --- /dev/null +++ b/.github/workflows/docker-base-image.yaml @@ -0,0 +1,40 @@ +name: Build and Push Docker TPU Base Image + +on: + push: + branches: + - main + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + 
uses: actions/checkout@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Cache Docker layers + uses: actions/cache@v3 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-buildx- + + - name: Get current date + id: date + run: echo "DATE=$(date +'%Y%m%d')" >> $GITHUB_ENV + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.DOCKER_PUSH_TOKEN }} + + - name: Build and Push Docker image + run: | + docker buildx build --file docker/tpu/Dockerfile.base --tag ghcr.io/${{ github.repository_owner }}/levanter-base:latest --tag ghcr.io/${{ github.repository_owner }}/levanter-base:${{ env.DATE }} --push . diff --git a/docker/tpu/Dockerfile.base b/docker/tpu/Dockerfile.base index dec775dfd..958fde8b7 100644 --- a/docker/tpu/Dockerfile.base +++ b/docker/tpu/Dockerfile.base @@ -5,12 +5,14 @@ RUN pip install virtualenv # venv binaries encode their directory, so we need to setup the venv in the final location RUN virtualenv -p python3.10 /opt/levanter/.venv ENV PATH /opt/levanter/.venv/bin:$PATH -RUN /opt/levanter/.venv/bin/pip install -U hatch "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html +RUN /opt/levanter/.venv/bin/pip install -U "jax[tpu]==0.4.30" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html # Install package dependencies to make incremental builds faster. WORKDIR /tmp/ ADD pyproject.toml README.md /tmp/ -RUN pip install $(hatch dep show requirements --all) +# work around setuptools bug +RUN mkdir -p /tmp/src +RUN pip install .[test] FROM python:3.10 diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental index fb83b65c3..ad75c5d42 100644 --- a/docker/tpu/Dockerfile.incremental +++ b/docker/tpu/Dockerfile.incremental @@ -1,4 +1,4 @@ -ARG IMAGE=ghcr.io/rjpower/levanter +ARG IMAGE=ghcr.io/stanford-crfm/levanter-base ARG TAG=latest FROM ${IMAGE}:${TAG} @@ -12,6 +12,8 @@ ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\ WORKDIR /opt/levanter +# We have to mkdir src/ to avoid setuptools error +RUN mkdir -p /opt/levanter/src ADD pyproject.toml README.md /opt/levanter/ RUN pip install -e '.[test]' ADD . /opt/levanter diff --git a/infra/launch.py b/infra/launch.py index 1b4569b37..7ccdd24a3 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -175,7 +175,7 @@ def _default_run_id(): cli.add_arg( parser, config, ["--autodelete"], default=False, action="store_true", help="Delete TPU if it already exists." 
) - cli.add_arg(parser, config, ["--docker_base_image"], default="ghcr.io/rjpower/levanter:latest") + cli.add_arg(parser, config, ["--docker_base_image"], default="ghcr.io/stanford-crfm/levanter-base:latest") cli.add_arg(parser, config, ["--docker_repository"], default="levanter") cli.add_arg(parser, config, ["--foreground"], default=False, action="store_true") cli.add_arg(parser, config, ["--image_name"], default=f"levanter-{getpass.getuser()}") From ef6349c7bfaccfb7d591886b327ce7fee02d2402 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 15 Aug 2024 09:54:26 -0700 Subject: [PATCH 19/94] RE-Allow adding extrenal directory to docker image (#695) * add mounting dir * minor fix * support abs and rel path * add docs * refactor to extra context * minor fix docs * minor fix * modify docs --- docker/tpu/Dockerfile.incremental | 7 +++++ docs/Getting-Started-TPU-VM.md | 7 +++++ infra/launch.py | 5 ++++ infra/push_docker.py | 45 +++++++++++++++++++++++++++---- 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental index ad75c5d42..d741674f3 100644 --- a/docker/tpu/Dockerfile.incremental +++ b/docker/tpu/Dockerfile.incremental @@ -3,6 +3,9 @@ ARG TAG=latest FROM ${IMAGE}:${TAG} +# This usually is a config directory so users can have their own config directory outside the repo. +ARG EXTRA_CTX=/config + ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\ TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\ RAY_USAGE_STATS_ENABLED=0\ @@ -17,3 +20,7 @@ RUN mkdir -p /opt/levanter/src ADD pyproject.toml README.md /opt/levanter/ RUN pip install -e '.[test]' ADD . /opt/levanter + +# Add $EXTRA_CTX to the same location as in local machine. +# so that the same (config) path(s) specified in train_lm.py argument still works +COPY .mnt $EXTRA_CTX diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md index f13e98541..b4963fbde 100644 --- a/docs/Getting-Started-TPU-VM.md +++ b/docs/Getting-Started-TPU-VM.md @@ -138,6 +138,13 @@ To run in the foreground, use `--foreground` with the `launch.py` script. You sh python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' ``` +### Using external directory/file + +In case that you want to reference some external directory/file outside of the levanter repo, you can do it by adding the external directory/file to the docker image so that it becomes accessible in TPU instances. You can specify the path you want to add as extra buildl context by `--extra_context` with the `launch.py` script. Then, you should be able to use the external files in arguments in `train_lm.py` etc. 
+```bash +python infra/launch.py --extra_context -- python src/levanter/main/train_lm.py --config_path --trainer.checkpointer.base_path gs://' +``` + ### Babysitting Script If you are using a preemptible TPU VM, you probably want to use the "babysitting" script that automatically re-creates diff --git a/infra/launch.py b/infra/launch.py index 7ccdd24a3..4f49689e1 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -7,6 +7,7 @@ import os import subprocess import time +from pathlib import Path from infra import push_docker from infra.helpers import cli @@ -210,6 +211,7 @@ def _default_run_id(): cli.add_arg(parser, config, ["--docker_registry"], default="gcp", choices=["gcp", "ghcr"]) cli.add_arg(parser, config, ["--github_user"], type=str) cli.add_arg(parser, config, ["--github_token"], type=str) + cli.add_arg(parser, config, ["--extra_context"], type=Path, default=Path("config")) parser.add_argument( "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=list(config.get("env", {}).items()) @@ -239,6 +241,7 @@ def _default_run_id(): registry = args.docker_registry github_user = args.github_user github_token = args.github_token + extra_context = args.extra_context region = "-".join(zone.split("-")[:-1]) env = {k: v for k, v in args.env} @@ -259,6 +262,7 @@ def _default_run_id(): github_user=github_user, github_token=github_token, docker_file="docker/tpu/Dockerfile.incremental", + extra_context=extra_context, ) elif registry == "gcp": full_image_id = push_docker.push_to_gcp( @@ -268,6 +272,7 @@ def _default_run_id(): image_name=image_id, tag=tag, docker_file="docker/tpu/Dockerfile.incremental", + extra_context=extra_context, ) else: raise ValueError(f"Unknown docker registry: {args.docker_registry}") diff --git a/infra/push_docker.py b/infra/push_docker.py index 450bae268..181b5bf07 100644 --- a/infra/push_docker.py +++ b/infra/push_docker.py @@ -9,9 +9,12 @@ import argparse import json +import os import pty +import shutil import subprocess import sys +from pathlib import Path from infra.helpers import cli @@ -35,6 +38,27 @@ ] +def _rm(path): + if path.is_dir(): + shutil.rmtree(path, ignore_errors=True) + elif path.is_file(): + os.remove(path) + elif path.exists(): + raise RuntimeError(f"Remove failed. Path ({path}) is neither a directory nor a file.") + + +def _cp(src, dst): + # delete dst if exists + _rm(dst) + + if src.is_dir(): + shutil.copytree(src, dst) + elif src.is_file(): + shutil.copy(src, dst) + else: + raise RuntimeError(f"Copy failed. Source path ({src}) is neither a directory nor a file. Check if it exists.") + + def _run(argv): if sys.stdout.isatty(): exit_code = pty.spawn(argv) @@ -128,14 +152,22 @@ def configure_gcp_docker(project_id, region, repository): _run(["gcloud", "auth", "configure-docker", "--quiet", f"{region}-docker.pkg.dev"]) -def build_docker(docker_file, image_name, tag) -> str: +def build_docker(docker_file, image_name, tag, mount_src) -> str: """Builds a Docker image, enables artifact access, and pushes to Artifact Registry.""" + # Copy external files temporarily to .mnt + mount_dst = Path(".mnt") + _cp(mount_src, mount_dst) + # Get mounting path in docker image. 
+ levanter_path = Path("/opt/levanter") + extra_context = levanter_path / mount_src _run( [ "docker", "buildx", "build", + "--build-arg", + f"EXTRA_CTX={extra_context.resolve()}", "--platform=linux/amd64", "-t", f"{image_name}:{tag}", @@ -144,12 +176,14 @@ def build_docker(docker_file, image_name, tag) -> str: ".", ] ) + # clean up after building + _rm(mount_dst) return f"{image_name}:{tag}" # Disabled until we can figure out how Docker hub organizations work -def push_to_github(local_image, tag, github_user=None, github_token=None, docker_file=None): +def push_to_github(local_image, tag, github_user=None, github_token=None, docker_file=None, extra_context=None): """Pushes a local Docker image to Docker Hub.""" # Authenticate the docker service with Github if a token exists @@ -160,17 +194,17 @@ def push_to_github(local_image, tag, github_user=None, github_token=None, docker print(login_process.communicate(input=github_token.encode(), timeout=10)) remote_name = f"ghcr.io/{github_user}/{local_image}:{tag}" - local_name = build_docker(docker_file=docker_file, image_name=local_image, tag=tag) + local_name = build_docker(docker_file=docker_file, image_name=local_image, tag=tag, mount_src=extra_context) _run(["docker", "tag", local_name, remote_name]) _run(["docker", "push", remote_name]) return remote_name -def push_to_gcp(project_id, region, repository, image_name, tag, docker_file) -> str: +def push_to_gcp(project_id, region, repository, image_name, tag, docker_file, extra_context) -> str: """Pushes a local Docker image to Artifact Registry.""" configure_gcp_docker(project_id, region, repository) - local_image = build_docker(docker_file=docker_file, image_name=image_name, tag=tag) + local_image = build_docker(docker_file=docker_file, image_name=image_name, tag=tag, mount_src=extra_context) artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}" @@ -214,4 +248,5 @@ def push_to_gcp(project_id, region, repository, image_name, tag, docker_file) -> args.image, args.tag, docker_file=args.docker_file, + extra_context=Path("config"), ) From c9ebc8891300b16d3c1ebee2d4bc2c7d7abfd784 Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 20 Aug 2024 15:09:28 -0700 Subject: [PATCH 20/94] match specs in dclm --- config/llama_7b_with_dclm.yaml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/config/llama_7b_with_dclm.yaml b/config/llama_7b_with_dclm.yaml index 9703fc2d5..e1bea2225 100644 --- a/config/llama_7b_with_dclm.yaml +++ b/config/llama_7b_with_dclm.yaml @@ -8,22 +8,24 @@ model: # 7B class model num_heads: 32 num_kv_heads: 32 use_flash_attention: True - flash_attention_block_size: 1024 trainer: tracker: type: wandb + entity: "stanford-crfm" project: "marin" tags: ["dclm", "7B", "llama"] mp: p=f32,c=bfloat16 train_batch_size: 2048 - num_train_steps: 750000 # 3,000,000,000,000 / 4,000,000 = 750,000 + num_train_steps: 69000 # 276e9 / 4M steps_per_eval: 1000 tensor_parallel_axes: ["mlp", "heads"] fsdp_axis: "embed" batch_axis: "batch" optimizer: - learning_rate: 4E-4 - weight_decay: 0.1 + learning_rate: 2e-3 + weight_decay: 0.05 min_lr_ratio: 0.1 - warmup: 0.01 + warmup: 5000 + +z_loss_weight: 5e-6 From 7727696ba918d21f7b11a8c9d2fe888256f6bc5c Mon Sep 17 00:00:00 2001 From: David Hall Date: Wed, 21 Aug 2024 14:01:13 -0700 Subject: [PATCH 21/94] publish dev build --- .github/workflows/publish_dev.yaml | 67 ++++++++++++++++++++++++++++++ src/levanter/__init__.py | 3 ++ 2 files changed, 70 insertions(+) create mode 100644 .github/workflows/publish_dev.yaml diff 
--git a/.github/workflows/publish_dev.yaml b/.github/workflows/publish_dev.yaml new file mode 100644 index 000000000..7ddcd1d2d --- /dev/null +++ b/.github/workflows/publish_dev.yaml @@ -0,0 +1,67 @@ +name: Publish Dev Build + +on: + workflow_run: + workflows: ["Run Tests"] + types: + - completed + branches: [main] + workflow_dispatch: + +jobs: + build-package: + runs-on: ubuntu-latest + if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Calculate Version and Build Number + run: | + PROJECT_VERSION=$(sed -n 's/__version__ = "\(.*\)"/\1/p' src/levanter/__init__.py) + BUILD_NUMBER=$(git rev-list --count HEAD) + FULL_VERSION="${PROJECT_VERSION}.dev${BUILD_NUMBER}" + echo "FULL_VERSION=${FULL_VERSION}" >> $GITHUB_ENV + echo "Calculated version with build number: $FULL_VERSION" + - name: Update pyproject.toml version + run: | + # replace the version in __init__.py + echo "Updating version in __init__.py to $FULL_VERSION" + sed -i "s/__version__ = \".*\"/__version__ = \"$FULL_VERSION\"/g" src/levanter/__init__.py + - name: Build package + run: | + python -m pip install --upgrade pip + pip install build + python -m build + + - name: Upload package + uses: actions/upload-artifact@v4 + with: + name: package + path: dist/ + + + # cf https://test.pypi.org/manage/project/levanter/settings/publishing/ + publish-dev: + runs-on: ubuntu-latest + needs: + - build-package + permissions: + id-token: write + steps: + - name: Retrieve release distributions + uses: actions/download-artifact@v4 + with: + name: package + path: dist/ + + - name: Publish release distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + diff --git a/src/levanter/__init__.py b/src/levanter/__init__.py index 093c8b545..2674d5bd6 100644 --- a/src/levanter/__init__.py +++ b/src/levanter/__init__.py @@ -11,3 +11,6 @@ import levanter.visualization as visualization from levanter.tracker import current_tracker from levanter.trainer import initialize + + +__version__ = "1.1" From 55e4d98a58648fdc0443758ee61488e3f82d4b0b Mon Sep 17 00:00:00 2001 From: David Hall Date: Wed, 21 Aug 2024 14:02:10 -0700 Subject: [PATCH 22/94] wip --- config/data/dclm_gpt_neo.yaml | 1 + config/llama_7b_with_dclm.yaml | 8 +++++--- infra/launch.py | 4 ++-- infra/push_docker.py | 2 +- pyproject.toml | 18 +++++++++++++++--- 5 files changed, 24 insertions(+), 9 deletions(-) diff --git a/config/data/dclm_gpt_neo.yaml b/config/data/dclm_gpt_neo.yaml index 36dbf69e6..fd1f6a87f 100644 --- a/config/data/dclm_gpt_neo.yaml +++ b/config/data/dclm_gpt_neo.yaml @@ -1,6 +1,7 @@ cache_dir: "gs://marin-data/tokenized/dclm/gpt_neo_tokenizer" tokenizer: "EleutherAI/gpt-neox-20b" stop_strategy: restart +shuffle_buffer_size: 100000 configs: "dclm": train_urls: diff --git a/config/llama_7b_with_dclm.yaml b/config/llama_7b_with_dclm.yaml index e1bea2225..9d30e9917 100644 --- a/config/llama_7b_with_dclm.yaml +++ b/config/llama_7b_with_dclm.yaml @@ -11,7 +11,7 @@ model: # 7B class model trainer: tracker: type: wandb - entity: "stanford-crfm" + entity: "stanford-mercury" project: "marin" tags: ["dclm", "7B", "llama"] @@ -23,9 +23,11 @@ trainer: fsdp_axis: "embed" batch_axis: "batch" optimizer: - learning_rate: 2e-3 - weight_decay: 0.05 + learning_rate: 4e-4 + weight_decay: 0.1 min_lr_ratio: 0.1 + beta1: 0.9 + beta2: 0.95 warmup: 5000 
z_loss_weight: 5e-6 diff --git a/infra/launch.py b/infra/launch.py index 7ccdd24a3..fe02a0b68 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -8,8 +8,8 @@ import subprocess import time -from infra import push_docker -from infra.helpers import cli +from . import push_docker +from .helpers import cli def setup_vm_docker(tpu_name, zone, node_count, docker_base_image): diff --git a/infra/push_docker.py b/infra/push_docker.py index 450bae268..66e10298c 100644 --- a/infra/push_docker.py +++ b/infra/push_docker.py @@ -13,7 +13,7 @@ import subprocess import sys -from infra.helpers import cli +from .helpers import cli GCP_CLEANUP_POLICY = [ diff --git a/pyproject.toml b/pyproject.toml index 5cdd9718b..dca97dcfe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,6 +98,18 @@ test = [ "pytest-asyncio" ] -[tool.setuptools.packages.find] -where = ["src"] -include = ["levanter", "levanter.*"] +#[tool.setuptools.packages.find] +#where = ["src"] +#include = ["levanter", "levanter.*"] + + +[tool.setuptools] +packages = ["levanter", "levanter.infra"] + +[tool.setuptools.package-dir] +levanter = "src/levanter" +"levanter.infra" = "infra" + +# set version from package +[tool.setuptools.dynamic] +version = "levanter.__version__" From de51236e8b157cd96be426d939d62f0f8b8779d1 Mon Sep 17 00:00:00 2001 From: David Hall Date: Wed, 21 Aug 2024 23:23:52 -0700 Subject: [PATCH 23/94] fix imports and such --- docker/tpu/Dockerfile.incremental | 4 +++- infra/launch.py | 12 ++++++++++-- pyproject.toml | 2 +- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental index ad75c5d42..1d380168a 100644 --- a/docker/tpu/Dockerfile.incremental +++ b/docker/tpu/Dockerfile.incremental @@ -13,7 +13,9 @@ ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\ WORKDIR /opt/levanter # We have to mkdir src/ to avoid setuptools error -RUN mkdir -p /opt/levanter/src +RUN mkdir -p /opt/levanter/src/levanter /opt/levanter/infra +# we need the version of the package to be able to install it, which is in src/levanter/__init__.py +COPY src/levanter/__init__.py /opt/levanter/src/levanter/ ADD pyproject.toml README.md /opt/levanter/ RUN pip install -e '.[test]' ADD . /opt/levanter diff --git a/infra/launch.py b/infra/launch.py index fe02a0b68..d12b7a390 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -6,10 +6,18 @@ import json import os import subprocess +import sys import time -from . import push_docker -from .helpers import cli + +# we do this nonsense so that it works as python -m levanter.infra.launch or python infra/launch.py +try: + from . 
import push_docker + from .helpers import cli # noqa: E402 +except ImportError: + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from infra import push_docker + from infra.helpers import cli # noqa: E402 def setup_vm_docker(tpu_name, zone, node_count, docker_base_image): diff --git a/pyproject.toml b/pyproject.toml index dca97dcfe..a2605729a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -112,4 +112,4 @@ levanter = "src/levanter" # set version from package [tool.setuptools.dynamic] -version = "levanter.__version__" +version = {attr = "levanter.__version__"} From 7863989f088d762c70e5dd33d9bfbfb99dfbb0eb Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 22 Aug 2024 00:40:01 -0700 Subject: [PATCH 24/94] get default zone from gcloud config --- infra/helpers/cli.py | 13 ++++++++++++- infra/launch.py | 8 +++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/infra/helpers/cli.py b/infra/helpers/cli.py index d065c6a1d..90345a09c 100644 --- a/infra/helpers/cli.py +++ b/infra/helpers/cli.py @@ -81,9 +81,20 @@ def _tpu_ssh_multislice(tpu_name, zone, node_count, *args, ignore_failure=False) # Oddly enough, there's no API to simply fetch the current gcloud configuration... def gcloud_config(): client = storage.Client() - return { + out: dict[str, str | None] = { "project": client.project, } + try: + out["zone"] = get_default_zone() + except subprocess.CalledProcessError: + out["zone"] = None + + return out + + +def get_default_zone() -> str: + result = subprocess.run(["gcloud", "config", "get-value", "compute/zone"], stdout=subprocess.PIPE, text=True) + return result.stdout.strip() def add_arg( diff --git a/infra/launch.py b/infra/launch.py index d12b7a390..1a594d323 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -212,7 +212,7 @@ def _default_run_id(): cli.add_arg(parser, config, ["--tpu_type"], required=True) cli.add_arg(parser, config, ["--node_count"], default=1, type=int) cli.add_arg(parser, config, ["--version"], default="tpu-ubuntu2204-base") - cli.add_arg(parser, config, ["--zone"], required=True) + cli.add_arg(parser, config, ["--zone"], default=None, type=str) cli.add_arg(parser, config, ["--retries"], default=0, type=int) cli.add_arg(parser, config, ["--run_id"], default=_default_run_id(), type=str) cli.add_arg(parser, config, ["--docker_registry"], default="gcp", choices=["gcp", "ghcr"]) @@ -248,6 +248,12 @@ def _default_run_id(): github_user = args.github_user github_token = args.github_token + if zone is None: + zone = cli.gcloud_config()["zone"] + + if zone is None: + raise ValueError("Zone must be specified or set in gcloud config.") + region = "-".join(zone.split("-")[:-1]) env = {k: v for k, v in args.env} From a550bb548f83f1c59e17cd7c45d6a55988744cd9 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 22 Aug 2024 00:44:21 -0700 Subject: [PATCH 25/94] factor out docker command, build --- infra/helpers/cli.py | 28 +++++++++++++++++++++++++++ infra/launch.py | 46 ++++++++++---------------------------------- infra/push_docker.py | 34 +++++++++++++------------------- 3 files changed, 51 insertions(+), 57 deletions(-) diff --git a/infra/helpers/cli.py b/infra/helpers/cli.py index 90345a09c..2ebdacb59 100644 --- a/infra/helpers/cli.py +++ b/infra/helpers/cli.py @@ -121,3 +121,31 @@ def load_config(): return yaml.load(open(".config", "r"), Loader=yaml.SafeLoader) else: return {} + + +def get_git_commit(): + """Get the current git commit hash.""" + return subprocess.check_output(["git", "rev-parse", 
"HEAD"]).decode("utf-8").strip() + + +def make_docker_run_command(image_id, command, *, foreground, env): + docker_command = [ + "docker", + "run", + "-t" if foreground else "-d", + "--name=levanter", + "--privileged", + "--shm-size=32gb", + "--net=host", + "--init", + "--mount", + "type=volume,source=levanter,target=/home/levanter", + "-v", + "/tmp:/tmp", + ] + + for k, v in env.items(): + docker_command.extend(["-e", k + f"='{str(v)}'"]) + + docker_command.extend([image_id, " ".join(command)]) + return docker_command diff --git a/infra/launch.py b/infra/launch.py index 1a594d323..aa0b1ea42 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -20,7 +20,7 @@ from infra.helpers import cli # noqa: E402 -def setup_vm_docker(tpu_name, zone, node_count, docker_base_image): +def setup_vm_docker(tpu_name, zone, node_count): """Change docker permissions on `tpu_name`, remove any old runs, and setup the cache volume.""" cli.tpu_ssh( tpu_name, @@ -260,28 +260,29 @@ def _default_run_id(): if "WANDB_PROJECT" not in env: env["WANDB_PROJECT"] = "levanter" + env["GIT_COMMIT"] = cli.get_git_commit() + env["RUN_ID"] = run_id + env["WANDB_DOCKER"] = image_id + if command[0] == "--": command = command[1:] # make an image tag based on the unix timestamp to ensure we always pull the latest image tag = int(time.time()) + local_id = push_docker.build_docker(docker_file="docker/tpu/Dockerfile.incremental", image_name=image_id, tag=tag) if registry == "ghcr": full_image_id = push_docker.push_to_github( - local_image=image_id, - tag=tag, + local_id=local_id, github_user=github_user, github_token=github_token, - docker_file="docker/tpu/Dockerfile.incremental", ) elif registry == "gcp": full_image_id = push_docker.push_to_gcp( + local_id=local_id, project_id=project, region=region, repository=docker_repository, - image_name=image_id, - tag=tag, - docker_file="docker/tpu/Dockerfile.incremental", ) else: raise ValueError(f"Unknown docker registry: {args.docker_registry}") @@ -304,36 +305,9 @@ def _default_run_id(): tpu_name=tpu_name, zone=zone, node_count=node_count, - docker_base_image=docker_base_image, ) - git_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip() - - docker_command = [ - "docker", - "run", - "-t" if foreground else "-d", - "--name=levanter", - "--privileged", - "--shm-size=32gb", - "--net=host", - "--init", - "--mount", - "type=volume,source=levanter,target=/home/levanter", - "-v", - "/tmp:/tmp", - "-e", - f"WANDB_DOCKER={image_id}", - "-e", - f"GIT_COMMIT={git_commit}", - "-e", - f"RUN_ID={run_id}", - ] - - for k, v in env.items(): - docker_command.extend(["-e", k + f"='{str(v)}'"]) - - docker_command.extend([full_image_id, " ".join(command)]) + docker_command = cli.make_docker_run_command(full_image_id, command, env=env, foreground=foreground) print(f"Running on tpu_name... {tpu_name}") cli.tpu_ssh(tpu_name, zone, node_count, *docker_command) @@ -346,7 +320,7 @@ def _default_run_id(): break if autodelete: - print("Autodelete is set to True. Tear down machine...") + print("Autodelete is set to True. 
Tearing down machine...") cli.run_command( "gcloud", "alpha", diff --git a/infra/push_docker.py b/infra/push_docker.py index 66e10298c..378aa05d0 100644 --- a/infra/push_docker.py +++ b/infra/push_docker.py @@ -148,8 +148,7 @@ def build_docker(docker_file, image_name, tag) -> str: return f"{image_name}:{tag}" -# Disabled until we can figure out how Docker hub organizations work -def push_to_github(local_image, tag, github_user=None, github_token=None, docker_file=None): +def push_to_github(local_id, github_user, github_token=None): """Pushes a local Docker image to Docker Hub.""" # Authenticate the docker service with Github if a token exists @@ -159,26 +158,24 @@ def push_to_github(local_image, tag, github_user=None, github_token=None, docker ) print(login_process.communicate(input=github_token.encode(), timeout=10)) - remote_name = f"ghcr.io/{github_user}/{local_image}:{tag}" - local_name = build_docker(docker_file=docker_file, image_name=local_image, tag=tag) + remote_name = f"ghcr.io/{github_user}/{local_id}" - _run(["docker", "tag", local_name, remote_name]) + _run(["docker", "tag", local_id, remote_name]) _run(["docker", "push", remote_name]) return remote_name -def push_to_gcp(project_id, region, repository, image_name, tag, docker_file) -> str: +def push_to_gcp(local_id, project_id, region, repository) -> str: """Pushes a local Docker image to Artifact Registry.""" configure_gcp_docker(project_id, region, repository) - local_image = build_docker(docker_file=docker_file, image_name=image_name, tag=tag) artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}" - full_image_name = f"{artifact_repo}/{image_name}:{tag}" - _run(["docker", "tag", local_image, full_image_name]) + full_image_name = f"{artifact_repo}/{local_id}" + _run(["docker", "tag", local_id, full_image_name]) _run(["docker", "push", full_image_name]) - return f"{region}-docker.pkg.dev/{project_id}/{repository}/{image_name}:{tag}" + return f"{artifact_repo}/{local_id}" if __name__ == "__main__": @@ -194,24 +191,19 @@ def push_to_gcp(project_id, region, repository, image_name, tag, docker_file) -> cli.add_arg(parser, config, ["--docker_file"], default="docker/tpu/Dockerfile.base", help="Dockerfile to use.") # push to either github or GCP - cli.add_arg(parser, config, ["--docker_target"], choices=["github", "gcp"], required=True) + cli.add_arg(parser, config, ["--docker_target"], choices=["github", "gcp", "ghcr"], required=True) args = parser.parse_args() - if args.docker_target == "github": + local_id = build_docker(docker_file=args.docker_file, image_name=args.image, tag=args.tag) + + if args.docker_target in ["github", "ghcr"]: assert args.github_user, "Must specify --github_user when pushing to Github" assert args.github_token, "Must specify --github_token when pushing to Github" - push_to_github(args.image, args.tag, args.github_user, args.github_token, docker_file=args.docker_file) + push_to_github(local_id=local_id, github_user=args.github_user, github_token=args.github_token) else: assert args.region, "Must specify --region when pushing to GCP" assert args.project, "Must specify --project when pushing to GCP" assert args.repository, "Must specify --repository when pushing to GCP" - push_to_gcp( - args.project, - args.region, - args.repository, - args.image, - args.tag, - docker_file=args.docker_file, - ) + push_to_gcp(local_id, args.project, args.region, args.repository) From 715a04a4538007abef89c1f175cd864d5e045ff8 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 22 Aug 2024 11:02:08 -0700 Subject: 
[PATCH 26/94] Update beta2=0.95 (#701) * Update beta2=0.95 * add notes for why --- src/levanter/optim/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/levanter/optim/config.py b/src/levanter/optim/config.py index 920a34455..97996ea8c 100644 --- a/src/levanter/optim/config.py +++ b/src/levanter/optim/config.py @@ -215,7 +215,9 @@ class HessianOptConfig(OptimizerConfig, abc.ABC): class AdamConfig(OptimizerConfig): weight_decay: float = 0.1 beta1: float = 0.9 - beta2: float = 0.999 + # cf https://docs.mosaicml.com/projects/composer/en/latest/api_reference/generated/composer.optim.DecoupledAdamW.html + # https://x.com/giffmana/status/1692641748445438301 + beta2: float = 0.95 epsilon: float = 1e-8 max_grad_norm: Optional[float] = 1.0 From c0ae0f9ce6cab486729d41dfa21a80a785fb9019 Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 23 Aug 2024 12:20:02 -0700 Subject: [PATCH 27/94] publish full tpu image (#703) * publish full tpu image * pre-commit --- .github/workflows/docker-base-image.yaml | 4 ++++ src/levanter/optim/config.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-base-image.yaml b/.github/workflows/docker-base-image.yaml index de7f297f4..8ff9f7071 100644 --- a/.github/workflows/docker-base-image.yaml +++ b/.github/workflows/docker-base-image.yaml @@ -38,3 +38,7 @@ jobs: - name: Build and Push Docker image run: | docker buildx build --file docker/tpu/Dockerfile.base --tag ghcr.io/${{ github.repository_owner }}/levanter-base:latest --tag ghcr.io/${{ github.repository_owner }}/levanter-base:${{ env.DATE }} --push . + + - name: Build and Push Incremental Docker image + run: | + docker buildx build --file docker/tpu/Dockerfile.incremental --tag ghcr.io/${{ github.repository_owner }}/levanter-tpu:latest --tag ghcr.io/${{ github.repository_owner }}/levanter-tpu:${{ env.DATE }} --push . diff --git a/src/levanter/optim/config.py b/src/levanter/optim/config.py index 97996ea8c..6d61159bd 100644 --- a/src/levanter/optim/config.py +++ b/src/levanter/optim/config.py @@ -217,7 +217,7 @@ class AdamConfig(OptimizerConfig): beta1: float = 0.9 # cf https://docs.mosaicml.com/projects/composer/en/latest/api_reference/generated/composer.optim.DecoupledAdamW.html # https://x.com/giffmana/status/1692641748445438301 - beta2: float = 0.95 + beta2: float = 0.95 epsilon: float = 1e-8 max_grad_norm: Optional[float] = 1.0 From ca7c9a647858ef52d117660b51189235851ccdde Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 23 Aug 2024 16:05:27 -0700 Subject: [PATCH 28/94] fix incremental build on CI (#704) --- docker/tpu/Dockerfile.incremental | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental index d741674f3..f49372ea9 100644 --- a/docker/tpu/Dockerfile.incremental +++ b/docker/tpu/Dockerfile.incremental @@ -23,4 +23,6 @@ ADD . /opt/levanter # Add $EXTRA_CTX to the same location as in local machine. # so that the same (config) path(s) specified in train_lm.py argument still works -COPY .mnt $EXTRA_CTX +#COPY .mnt $EXTRA_CTX +# it's already in the image, so we don't need to copy it. 
just move it if we set EXTRA_CTX +RUN if [ -n "$EXTRA_CTX" ]; then mkdir -p $(dirname $EXTRA_CTX) && mv .mnt $EXTRA_CTX; fi From d16482b9615e38ba0e8a262e859bd243bc0f223c Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 23 Aug 2024 16:23:00 -0700 Subject: [PATCH 29/94] sigh --- docker/tpu/Dockerfile.incremental | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental index f49372ea9..203231206 100644 --- a/docker/tpu/Dockerfile.incremental +++ b/docker/tpu/Dockerfile.incremental @@ -25,4 +25,4 @@ ADD . /opt/levanter # so that the same (config) path(s) specified in train_lm.py argument still works #COPY .mnt $EXTRA_CTX # it's already in the image, so we don't need to copy it. just move it if we set EXTRA_CTX -RUN if [ -n "$EXTRA_CTX" ]; then mkdir -p $(dirname $EXTRA_CTX) && mv .mnt $EXTRA_CTX; fi +RUN if [ -n ".mnt" ]; then mkdir -p $(dirname $EXTRA_CTX) && mv .mnt $EXTRA_CTX; fi From c823c75b50dcd403381c238efbca84a033828e79 Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 23 Aug 2024 16:53:14 -0700 Subject: [PATCH 30/94] grr (#705) --- docker/tpu/Dockerfile.incremental | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental index 203231206..3341ea2f2 100644 --- a/docker/tpu/Dockerfile.incremental +++ b/docker/tpu/Dockerfile.incremental @@ -25,4 +25,4 @@ ADD . /opt/levanter # so that the same (config) path(s) specified in train_lm.py argument still works #COPY .mnt $EXTRA_CTX # it's already in the image, so we don't need to copy it. just move it if we set EXTRA_CTX -RUN if [ -n ".mnt" ]; then mkdir -p $(dirname $EXTRA_CTX) && mv .mnt $EXTRA_CTX; fi +RUN if [ -f ".mnt" ]; then mkdir -p $(dirname $EXTRA_CTX) && mv .mnt $EXTRA_CTX; fi From 7ec7bb54085fd2d0d48f184546dd482fcf15729a Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Sun, 25 Aug 2024 20:39:54 -0700 Subject: [PATCH 31/94] Adding multiple configs (#685) * Adding multiple configs * Add type hinting --- src/levanter/config.py | 61 +++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/src/levanter/config.py b/src/levanter/config.py index fefe1b2d3..8caafc256 100644 --- a/src/levanter/config.py +++ b/src/levanter/config.py @@ -93,26 +93,51 @@ def _maybe_get_config_path_and_cmdline_args(args: List[str]): If URL, we need to download it and save it to a temp file. We then want to remove --config_path from the cmdline args so that draccus doesn't try to load it as a config path and return it separately here along with the modified cmdline args. + We also accept ... --configs ... and concatenate them into a single config. 
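+
+    For example, an invocation like the following (the second YAML name is a hypothetical placeholder) is treated
+    as if the listed files had been concatenated into a single YAML document before parsing:
+
+        python src/levanter/main/train_lm.py --configs config/llama_7b_with_dclm.yaml config/my_overrides.yaml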
""" - if "--config_path" not in args and "--config" not in args: + if "--config_path" not in args and "--config" not in args and "--configs" not in args: return None, args else: - try: - config_path_index = args.index("--config_path") - except ValueError: - config_path_index = args.index("--config") - - config_path = args[config_path_index + 1] - - if urllib.parse.urlparse(config_path).scheme: - fs: AbstractFileSystem - fs, fs_path = fsspec.core.url_to_fs(config_path) - temp_file = tempfile.NamedTemporaryFile(prefix="config", suffix=".yaml", delete=False) - atexit.register(lambda: os.unlink(temp_file.name)) - fs.get(fs_path, temp_file.name) - config_path = temp_file.name + config_args = ["--config_path", "--config", "--configs"] + found_indices = [args.index(arg) for arg in config_args if arg in args] + if len(found_indices) > 1: + raise ValueError(f"Multiple config args found in {args}") + config_path_index = found_indices[0] + 1 + config_paths: List[str] = [] args = args.copy() - del args[config_path_index] - del args[config_path_index] - return config_path, args + del args[config_path_index - 1] + config_path_index -= 1 + + while config_path_index < len(args) and not args[config_path_index].startswith("-"): + + config_path = args[config_path_index] + + if urllib.parse.urlparse(config_path).scheme: + fs: AbstractFileSystem + fs, fs_path = fsspec.core.url_to_fs(config_path) + temp_file = tempfile.NamedTemporaryFile(prefix="config", suffix=".yaml", delete=False) + atexit.register(lambda: os.unlink(temp_file.name)) + fs.get(fs_path, temp_file.name) + config_path = temp_file.name + + config_paths.append(config_path) + del args[config_path_index] + + merged_config_path = None + + if len(config_paths) == 1: + merged_config_path = config_paths[0] + elif len(config_paths) > 1: + # merge the configs by concatenating them + temp_merged_config_path = tempfile.NamedTemporaryFile(prefix="config_merged", suffix=".yaml", delete=False) + atexit.register(lambda: os.unlink(temp_merged_config_path.name)) + with open(temp_merged_config_path.name, "w") as f: + for config_path in config_paths: + with open(config_path) as config_file: + f.write(config_file.read()) + merged_config_path = temp_merged_config_path.name + else: + raise ValueError("No config path found in args") + + return merged_config_path, args From 20faff32516ecf5d6c459db382f12aa0a4405e0d Mon Sep 17 00:00:00 2001 From: David Hall Date: Mon, 26 Aug 2024 11:02:06 -0700 Subject: [PATCH 32/94] Expose infra as a package, publish dev builds (#696) Publish levanter dev build automatically move stuff from infra/**/*.py into src/levanter/infra/{cli_helpers,tpu,docker}.py make the extra context stuff be optional (I actually have another way of dealing with it in Marin now --- .github/workflows/docker-base-image.yaml | 2 +- .github/workflows/publish_dev.yaml | 67 +++++ docker/tpu/Dockerfile.base | 10 +- docker/tpu/Dockerfile.incremental | 4 - docs/Getting-Started-TPU-VM.md | 9 +- infra/helpers/cli.py | 112 --------- infra/launch.py | 288 ++++------------------ infra/push_docker.py | 229 +---------------- pyproject.toml | 30 ++- src/levanter/__init__.py | 3 + {infra => src/levanter/infra}/__init__.py | 0 src/levanter/infra/cli_helpers.py | 132 ++++++++++ src/levanter/infra/docker.py | 224 +++++++++++++++++ src/levanter/infra/tpus.py | 255 +++++++++++++++++++ 14 files changed, 770 insertions(+), 595 deletions(-) create mode 100644 .github/workflows/publish_dev.yaml delete mode 100644 infra/helpers/cli.py rename {infra => src/levanter/infra}/__init__.py 
(100%) create mode 100644 src/levanter/infra/cli_helpers.py create mode 100644 src/levanter/infra/docker.py create mode 100644 src/levanter/infra/tpus.py diff --git a/.github/workflows/docker-base-image.yaml b/.github/workflows/docker-base-image.yaml index 8ff9f7071..a5ada69c3 100644 --- a/.github/workflows/docker-base-image.yaml +++ b/.github/workflows/docker-base-image.yaml @@ -1,4 +1,4 @@ -name: Build and Push Docker TPU Base Image +name: Build and Push Docker TPU Images on: push: diff --git a/.github/workflows/publish_dev.yaml b/.github/workflows/publish_dev.yaml new file mode 100644 index 000000000..095167e3f --- /dev/null +++ b/.github/workflows/publish_dev.yaml @@ -0,0 +1,67 @@ +name: Publish Dev Build + +on: + workflow_run: + workflows: ["Run Tests"] + types: + - completed + branches: [main] + workflow_dispatch: + +jobs: + build-package: + runs-on: ubuntu-latest + if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Calculate Version and Build Number + run: | + PROJECT_VERSION=$(sed -n 's/__version__ = "\(.*\)"/\1/p' src/levanter/__init__.py) + BUILD_NUMBER=$(git rev-list --count HEAD) + FULL_VERSION="${PROJECT_VERSION}.dev${BUILD_NUMBER}" + echo "FULL_VERSION=${FULL_VERSION}" >> $GITHUB_ENV + echo "Calculated version with build number: $FULL_VERSION" + - name: Update pyproject.toml version + run: | + # replace the version in pyproject.toml + sed -i "s/version = \".*\"/version = \"$FULL_VERSION\"/g" pyproject.toml + + - name: Build package + run: | + python -m pip install --upgrade pip + pip install build + python -m build + + - name: Upload package + uses: actions/upload-artifact@v4 + with: + name: package + path: dist/ + + + # cf https://test.pypi.org/manage/project/levanter/settings/publishing/ + publish-dev: + runs-on: ubuntu-latest + needs: + - build-package + permissions: + id-token: write + steps: + - name: Retrieve release distributions + uses: actions/download-artifact@v4 + with: + name: package + path: dist/ + + - name: Publish release distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + diff --git a/docker/tpu/Dockerfile.base b/docker/tpu/Dockerfile.base index 958fde8b7..9e078c07c 100644 --- a/docker/tpu/Dockerfile.base +++ b/docker/tpu/Dockerfile.base @@ -5,14 +5,12 @@ RUN pip install virtualenv # venv binaries encode their directory, so we need to setup the venv in the final location RUN virtualenv -p python3.10 /opt/levanter/.venv ENV PATH /opt/levanter/.venv/bin:$PATH -RUN /opt/levanter/.venv/bin/pip install -U "jax[tpu]==0.4.30" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html +RUN /opt/levanter/.venv/bin/pip install -U uv "jax[tpu]==0.4.30" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html # Install package dependencies to make incremental builds faster. 
-WORKDIR /tmp/
-ADD pyproject.toml README.md /tmp/
-# work around setuptools bug
-RUN mkdir -p /tmp/src
-RUN pip install .[test]
+WORKDIR /opt/levanter
+ADD pyproject.toml README.md /opt/levanter/
+RUN uv sync --no-install-project

 FROM python:3.10
diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental
index 3341ea2f2..4b0ddb608 100644
--- a/docker/tpu/Dockerfile.incremental
+++ b/docker/tpu/Dockerfile.incremental
@@ -15,14 +15,10 @@ ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\

 WORKDIR /opt/levanter

-# We have to mkdir src/ to avoid setuptools error
-RUN mkdir -p /opt/levanter/src
 ADD pyproject.toml README.md /opt/levanter/
 RUN pip install -e '.[test]'
 ADD . /opt/levanter

 # Add $EXTRA_CTX to the same location as in local machine.
-# so that the same (config) path(s) specified in train_lm.py argument still works
-#COPY .mnt $EXTRA_CTX
 # it's already in the image, so we don't need to copy it. just move it if we set EXTRA_CTX
 RUN if [ -f ".mnt" ]; then mkdir -p $(dirname $EXTRA_CTX) && mv .mnt $EXTRA_CTX; fi
diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md
index b4963fbde..3bcb26092 100644
--- a/docs/Getting-Started-TPU-VM.md
+++ b/docs/Getting-Started-TPU-VM.md
@@ -138,7 +138,7 @@ To run in the foreground, use `--foreground` with the `launch.py` script. You sh
 python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://'
 ```

-### Using external directory/file
+### Using an external directory or file
 In case that you want to reference some external directory/file outside of the levanter repo, you can do it by adding the external directory/file to the docker image so that it becomes accessible in TPU instances. You can specify the path you want to add as extra build context by `--extra_context` with the `launch.py` script. Then, you should be able to use the external files in arguments in `train_lm.py` etc.

 ```bash
@@ -147,8 +147,10 @@ python infra/launch.py --extra_context -- python src/levanter/ma

 ### Babysitting Script

-If you are using a preemptible TPU VM, you probably want to use the "babysitting" script that automatically re-creates
-the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours. You can run `launch.py` with the `--retries` and `--foreground` parameter to accomplish this. If `--retries` is greater than 1, `launch.py` will automatically attempt to re-create the VM and re-run the command if it fails. (`--foreground` is necessary to keep the script from returning immediately.)
+If you are using a preemptible TPU VM, you probably want to use the "babysitting" version of the script to keep an eye on
+the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours.
+You can run `launch.py` with the `--retries` and `--foreground` parameter to accomplish this.
+If `--retries` is greater than 1, `launch.py` will automatically attempt to re-create the VM and re-run the command if it fails. (`--foreground` is necessary to keep the script from returning immediately.)
 ```bash
 python infra/launch.py --retries=100 --foreground --tpu_name=my_tpu -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \
@@ -185,6 +187,7 @@ Tokenizers and configuration files are loaded via `fsspec` which supports remote
 filesystems, so you can also copy your tokenizer or config file to GCS and use a `gs://` path to access it.
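+
+For example, a config uploaded to your own bucket (the bucket and file name below are only illustrative placeholders) can be passed directly:
+
+```bash
+python infra/launch.py -- python src/levanter/main/train_lm.py --config_path gs://my-bucket/my_config.yaml
+```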
+ ## Common Issues ### (CRFM) Permission denied on `/files` diff --git a/infra/helpers/cli.py b/infra/helpers/cli.py deleted file mode 100644 index d065c6a1d..000000000 --- a/infra/helpers/cli.py +++ /dev/null @@ -1,112 +0,0 @@ -import argparse -import concurrent.futures -import os -import subprocess -import typing - -import yaml -from google.cloud import storage - - -def run_command(*args, **kwargs): - print("Running:", " ".join(list(args))) - return subprocess.check_call(args, **kwargs) - - -def add_ssh_key(ssh_key_filename): - # format 3072 SHA256:... key-name (RSA) - key_hash = subprocess.check_output(["ssh-keygen", "-lf", ssh_key_filename]).decode("utf-8").split()[1] - existing_keys = subprocess.check_output(["ssh-add", "-l"]).decode("utf-8").split("\n") - for key in existing_keys: - if key_hash in key: - return - - subprocess.check_call(["ssh-add", ssh_key_filename]) - - -def tpu_ssh(tpu_name, zone, node_count, *args, ignore_failure=False): - add_ssh_key(os.path.expanduser("~/.ssh/google_compute_engine")) - try: - if node_count > 1: - return _tpu_ssh_multislice(tpu_name, zone, node_count, *args, ignore_failure=ignore_failure) - - return run_command( - "gcloud", - "alpha", - "compute", - "tpus", - "tpu-vm", - "ssh", - tpu_name, - "--worker=all", - f"--zone={zone}", - "--command=%s" % " ".join(args), - ) - except subprocess.CalledProcessError as e: - if ignore_failure: - print("Ignoring failure:", e) - else: - raise - - -def _tpu_ssh_multislice(tpu_name, zone, node_count, *args, ignore_failure=False): - with concurrent.futures.ProcessPoolExecutor() as executor: - futures = [ - executor.submit( - run_command, - "gcloud", - "alpha", - "compute", - "tpus", - "tpu-vm", - "ssh", - f"{tpu_name}-{i}", - "--worker=all", - f"--zone={zone}", - "--command=%s" % " ".join(args), - ) - for i in range(node_count) - ] - - for future in concurrent.futures.as_completed(futures): - try: - future.result() - except subprocess.CalledProcessError as e: - if ignore_failure: - print("Ignoring failure:", e) - else: - raise - - -# Oddly enough, there's no API to simply fetch the current gcloud configuration... 
-def gcloud_config(): - client = storage.Client() - return { - "project": client.project, - } - - -def add_arg( - parser: argparse.ArgumentParser, config: typing.Dict, flags: typing.List[str], required=False, default=None, **kw -): - """Add an argument to the parser, using `config` or the environment to resolve default values.""" - key = flags[0].lstrip("-").replace("-", "_") - if key in config: - default = config[key] - - if key.upper() in os.environ: - default = os.environ[key.upper()] - - if default is not None: - kw["default"] = default - elif required: - kw["required"] = True - - parser.add_argument(*flags, **kw) - - -def load_config(): - if os.path.exists(".config"): - return yaml.load(open(".config", "r"), Loader=yaml.SafeLoader) - else: - return {} diff --git a/infra/launch.py b/infra/launch.py index 4f49689e1..2adb110d3 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -1,217 +1,41 @@ #!/usr/bin/python import argparse -import base64 import getpass -import json -import os import subprocess import time from pathlib import Path -from infra import push_docker -from infra.helpers import cli +import levanter.infra.cli_helpers as cli +import levanter.infra.docker as docker +import levanter.infra.tpus +from levanter.infra.tpus import launch_job -def setup_vm_docker(tpu_name, zone, node_count, docker_base_image): - """Change docker permissions on `tpu_name`, remove any old runs, and setup the cache volume.""" - cli.tpu_ssh( - tpu_name, - zone, - node_count, - "sudo", - "usermod", - "-aG", - "docker", - getpass.getuser(), - "&&", - "sudo", - "docker", - "volume", - "create", - "--driver=local", - "levanter", - "&&", - "sudo", - "docker", - "rm", - "-f", - "levanter", - ) - - -def list_tpus(zone): - return json.loads( - subprocess.check_output( - [ - "gcloud", - "alpha", - "compute", - "tpus", - "queued-resources", - "list", - f"--zone={zone}", - "--format=json(name.basename(), state)", - ] - ) - ) - - -def describe_tpu(tpu_name, zone): - try: - return json.loads( - subprocess.check_output( - [ - "gcloud", - "alpha", - "compute", - "tpus", - "queued-resources", - "describe", - tpu_name, - f"--zone={zone}", - "--format=json(name.basename(), state)", - ] - ) - ) - except subprocess.CalledProcessError: - return None - - -def start_tpu_vm(tpu_name, *, tpu_type, capacity_type, version, zone, autodelete, node_count): - tpu_stat = describe_tpu(tpu_name, zone) - if tpu_stat is not None: - if tpu_stat["state"]["state"] in ["FAILED", "SUSPENDED"]: - print("TPU suspended, bypassing autodelete config and deleting...") - elif not autodelete: - print("TPU already exists and autodelete is false, leaving it as is.") - return - else: - print("TPU already exists, deleting...") - - cli.run_command( - "gcloud", - "alpha", - "compute", - "tpus", - "queued-resources", - "delete", - tpu_name, - "--quiet", - f"--zone={zone}", - "--force", - ) - - print(f"Creating new TPU {tpu_name} in {zone} of type {tpu_type}...") - command = [ - "gcloud", - "alpha", - "compute", - "tpus", - "queued-resources", - "create", - tpu_name, - f"--accelerator-type={tpu_type}", - f"--runtime-version={version}", - f"--zone={zone}", - "--quiet", - ] - if capacity_type in ["preemptible", "best-effort"]: - command.append("--best-effort") - elif capacity_type == "reserved": - command.append("--reserved") - elif capacity_type == "spot": - command.append("--spot") - elif capacity_type == "on-demand" or capacity_type is None: - pass - else: - raise ValueError(f"Unknown capacity type: {capacity_type}") - - if node_count == 1: - 
command.append(f"--node-id={tpu_name}") - else: - command.append(f"--node-count={node_count}") - - cli.run_command(*command) - - # wait for queued resource to complete - print("Checking TPU creation status every minute...") - waited = 0 - while True: - time.sleep(60) - waited += 1 - - tpu_stat = describe_tpu(tpu_name, zone) - assert tpu_stat is not None, f"{tpu_name} creation failed." - - match tpu_stat["state"]["state"]: - case "ACTIVE": - break - case "FAILED": - raise RuntimeError( - f"{tpu_name} creation failed: {tpu_stat['state']['failedData']['error']['message']}" - ) - case _: - print(f"Status is {tpu_stat['state']['state']}. Waited {waited} minutes...") - - -def _default_run_id(): - """Generate a run ID for wandb and continuation. - - Wandb expects a base36 encoded ID of exactly 8 lowercase characters - or it won't generate a display name.""" - rng_bytes = os.urandom(16) - run_id = base64.b32encode(rng_bytes)[:8].lower() - run_id = run_id.decode("utf-8") - assert len(run_id) == 8 - for char in run_id: - assert char in "abcdefghijklmnopqrstuvwxyz0123456789" - return run_id - - -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser() config = cli.load_config() cli.add_arg( - parser, config, ["--autodelete"], default=False, action="store_true", help="Delete TPU if it already exists." + parser, config, ["--autodelete"], default=False, action="store_true", help="Delete TPU after job completes." ) cli.add_arg(parser, config, ["--docker_base_image"], default="ghcr.io/stanford-crfm/levanter-base:latest") cli.add_arg(parser, config, ["--docker_repository"], default="levanter") cli.add_arg(parser, config, ["--foreground"], default=False, action="store_true") cli.add_arg(parser, config, ["--image_name"], default=f"levanter-{getpass.getuser()}") - cli.add_arg( - parser, - config, - ["--capacity_type"], - default=None, - choices=["preemptible", "spot", "reserved", "on-demand", "best-effort"], - ) - cli.add_arg( - parser, - config, - ["--preemptible"], - required=False, - action="store_const", - const="preemptible", - dest="capacity_type", - ) - cli.add_arg(parser, config, ["--spot"], required=False, action="store_const", const="spot", dest="capacity_type") - cli.add_arg( - parser, config, ["--reserved"], required=False, action="store_const", const="reserved", dest="capacity_type" - ) + cli.add_capacity_type_args(parser, config) cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"]) cli.add_arg(parser, config, ["--tpu_name"], required=True) cli.add_arg(parser, config, ["--tpu_type"], required=True) cli.add_arg(parser, config, ["--node_count"], default=1, type=int) cli.add_arg(parser, config, ["--version"], default="tpu-ubuntu2204-base") - cli.add_arg(parser, config, ["--zone"], required=True) - cli.add_arg(parser, config, ["--retries"], default=0, type=int) - cli.add_arg(parser, config, ["--run_id"], default=_default_run_id(), type=str) + cli.add_arg(parser, config, ["--zone"], default=None, type=str, required=False) + cli.add_arg(parser, config, ["--retries"], default=10, type=int) + cli.add_arg(parser, config, ["--run_id"], default=cli.default_run_id(), type=str) cli.add_arg(parser, config, ["--docker_registry"], default="gcp", choices=["gcp", "ghcr"]) cli.add_arg(parser, config, ["--github_user"], type=str) cli.add_arg(parser, config, ["--github_token"], type=str) - cli.add_arg(parser, config, ["--extra_context"], type=Path, default=Path("config")) + cli.add_arg(parser, config, ["--extra_context"], type=Path, required=False, default=None) 
parser.add_argument( "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=list(config.get("env", {}).items()) @@ -222,7 +46,6 @@ def _default_run_id(): autodelete = args.autodelete command = args.command - docker_base_image = args.docker_base_image docker_repository = args.docker_repository foreground = args.foreground image_id = args.image_name @@ -243,91 +66,64 @@ def _default_run_id(): github_token = args.github_token extra_context = args.extra_context + if zone is None: + zone = cli.gcloud_config()["zone"] + + if zone is None: + raise ValueError("Zone must be specified or set in gcloud config.") + region = "-".join(zone.split("-")[:-1]) env = {k: v for k, v in args.env} if "WANDB_PROJECT" not in env: env["WANDB_PROJECT"] = "levanter" + env["GIT_COMMIT"] = cli.get_git_commit() + env["RUN_ID"] = run_id + env["WANDB_DOCKER"] = image_id + if command[0] == "--": command = command[1:] # make an image tag based on the unix timestamp to ensure we always pull the latest image tag = int(time.time()) + with docker.copy_extra_ctx(extra_context) as extra_context: + build_args = {"EXTRA_CTX": extra_context} if extra_context else None + local_id = docker.build_docker( + docker_file="docker/tpu/Dockerfile.incremental", image_name=image_id, tag=tag, build_args=build_args + ) + if registry == "ghcr": - full_image_id = push_docker.push_to_github( - local_image=image_id, - tag=tag, + full_image_id = docker.push_to_github( + local_id=local_id, github_user=github_user, github_token=github_token, - docker_file="docker/tpu/Dockerfile.incremental", - extra_context=extra_context, ) elif registry == "gcp": - full_image_id = push_docker.push_to_gcp( + full_image_id = docker.push_to_gcp( + local_id=local_id, project_id=project, region=region, repository=docker_repository, - image_name=image_id, - tag=tag, - docker_file="docker/tpu/Dockerfile.incremental", - extra_context=extra_context, ) else: - raise ValueError(f"Unknown docker registry: {args.docker_registry}") + raise ValueError(f"Unknown docker registry: {registry}") for i in range(retries + 1): try: - start_tpu_vm( + launch_job( + command=command, tpu_name=tpu_name, tpu_type=tpu_type, capacity_type=capacity_type, - version=version, - zone=zone, - autodelete=autodelete, - node_count=node_count, - ) - - # We don't technically need to setup on every run, but if we are working on a - # stale VM or a VM from e.g. spin-up-vm.sh, this ensures things always work. - setup_vm_docker( - tpu_name=tpu_name, zone=zone, node_count=node_count, - docker_base_image=docker_base_image, + full_image_id=full_image_id, + env=env, + foreground=foreground, + version=version, ) - - git_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip() - - docker_command = [ - "docker", - "run", - "-t" if foreground else "-d", - "--name=levanter", - "--privileged", - "--shm-size=32gb", - "--net=host", - "--init", - "--mount", - "type=volume,source=levanter,target=/home/levanter", - "-v", - "/tmp:/tmp", - "-e", - f"WANDB_DOCKER={image_id}", - "-e", - f"GIT_COMMIT={git_commit}", - "-e", - f"RUN_ID={run_id}", - ] - - for k, v in env.items(): - docker_command.extend(["-e", k + f"='{str(v)}'"]) - - docker_command.extend([full_image_id, " ".join(command)]) - - print(f"Running on tpu_name... 
{tpu_name}") - cli.tpu_ssh(tpu_name, zone, node_count, *docker_command) except subprocess.CalledProcessError as e: # noqa: F841 print(f"Error running command {e.cmd}") if i < retries - 1: @@ -337,8 +133,8 @@ def _default_run_id(): break if autodelete: - print("Autodelete is set to True. Tear down machine...") - cli.run_command( + print("Autodelete is set to True. Tearing down machine...") + levanter.infra.tpus.run_command( "gcloud", "alpha", "compute", @@ -350,3 +146,7 @@ def _default_run_id(): f"--zone={zone}", "--force", ) + + +if __name__ == "__main__": + main() diff --git a/infra/push_docker.py b/infra/push_docker.py index 181b5bf07..c4bb58f99 100644 --- a/infra/push_docker.py +++ b/infra/push_docker.py @@ -6,213 +6,11 @@ It is not necessary to run this yourself unless you are deploying a new base image: the launch script will automatically build and deploy an image based on your current code. """ - import argparse -import json -import os -import pty -import shutil -import subprocess -import sys -from pathlib import Path - -from infra.helpers import cli - - -GCP_CLEANUP_POLICY = [ - { - "name": "delete-stale", - "action": {"type": "Delete"}, - "condition": { - "olderThan": "86400s", - "tagState": "ANY", - }, - }, - { - "name": "keep-latest", - "action": {"type": "Keep"}, - "mostRecentVersions": { - "keepCount": 5, - }, - }, -] - - -def _rm(path): - if path.is_dir(): - shutil.rmtree(path, ignore_errors=True) - elif path.is_file(): - os.remove(path) - elif path.exists(): - raise RuntimeError(f"Remove failed. Path ({path}) is neither a directory nor a file.") - - -def _cp(src, dst): - # delete dst if exists - _rm(dst) - - if src.is_dir(): - shutil.copytree(src, dst) - elif src.is_file(): - shutil.copy(src, dst) - else: - raise RuntimeError(f"Copy failed. Source path ({src}) is neither a directory nor a file. Check if it exists.") - - -def _run(argv): - if sys.stdout.isatty(): - exit_code = pty.spawn(argv) - if exit_code != 0: - raise subprocess.CalledProcessError(exit_code, argv) - else: - subprocess.check_output(argv, stderr=subprocess.STDOUT) - - -def configure_gcp_docker(project_id, region, repository): - """Setup Artifact registry repository and configure permissions to enable TPU access.""" - # check if the repository already exists - try: - _run( - ["gcloud", "artifacts", "repositories", "describe", f"--location={region}", repository], - ) - print(f"Found existing artifact registry repository `{repository}`, skipping setup.") - return - except subprocess.CalledProcessError as e: - if b"NOT_FOUND" not in e.output: - raise - - # Activate artifact registry and setup the repository. - _run(["gcloud", "services", "enable", "artifactregistry.googleapis.com"]) - - try: - _run( - [ - "gcloud", - "artifacts", - "repositories", - "create", - repository, - f"--location={region}", - "--repository-format=docker", - ], - ) - except subprocess.CalledProcessError as e: - # Ignore error if repository already exists. 
- if b"ALREADY_EXISTS" not in e.output: - print("Error creating repository: ", e.output) - raise - - with open("/tmp/cleanup-policy.json", "w") as f: - json.dump(GCP_CLEANUP_POLICY, f, indent=2) - _run( - [ - "gcloud", - "artifacts", - "repositories", - "set-cleanup-policies", - f"--location={region}", - "--policy=/tmp/cleanup-policy.json", - repository, - ] - ) - - # Grant public read access ('allUsers') for TPU VMs - _run( - [ - "gcloud", - "artifacts", - "repositories", - "add-iam-policy-binding", - "--member=allUsers", - "--role=roles/artifactregistry.reader", - f"--location={region}", - repository, - ] - ) - - _run( - [ - "gcloud", - "--project", - project_id, - "artifacts", - "repositories", - "add-iam-policy-binding", - repository, - "--location", - region, - "--member", - "allUsers", - "--role", - "roles/artifactregistry.reader", - ] - ) - - _run(["gcloud", "auth", "configure-docker", "--quiet", f"{region}-docker.pkg.dev"]) - - -def build_docker(docker_file, image_name, tag, mount_src) -> str: - """Builds a Docker image, enables artifact access, and pushes to Artifact Registry.""" - # Copy external files temporarily to .mnt - mount_dst = Path(".mnt") - _cp(mount_src, mount_dst) - - # Get mounting path in docker image. - levanter_path = Path("/opt/levanter") - extra_context = levanter_path / mount_src - _run( - [ - "docker", - "buildx", - "build", - "--build-arg", - f"EXTRA_CTX={extra_context.resolve()}", - "--platform=linux/amd64", - "-t", - f"{image_name}:{tag}", - "-f", - docker_file, - ".", - ] - ) - # clean up after building - _rm(mount_dst) - - return f"{image_name}:{tag}" - - -# Disabled until we can figure out how Docker hub organizations work -def push_to_github(local_image, tag, github_user=None, github_token=None, docker_file=None, extra_context=None): - """Pushes a local Docker image to Docker Hub.""" - - # Authenticate the docker service with Github if a token exists - if github_token: - login_process = subprocess.Popen( - ["docker", "login", "ghcr.io", "-u", github_user, "--password-stdin"], stdin=subprocess.PIPE - ) - print(login_process.communicate(input=github_token.encode(), timeout=10)) - - remote_name = f"ghcr.io/{github_user}/{local_image}:{tag}" - local_name = build_docker(docker_file=docker_file, image_name=local_image, tag=tag, mount_src=extra_context) - - _run(["docker", "tag", local_name, remote_name]) - _run(["docker", "push", remote_name]) - return remote_name - - -def push_to_gcp(project_id, region, repository, image_name, tag, docker_file, extra_context) -> str: - """Pushes a local Docker image to Artifact Registry.""" - configure_gcp_docker(project_id, region, repository) - local_image = build_docker(docker_file=docker_file, image_name=image_name, tag=tag, mount_src=extra_context) - - artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}" - - full_image_name = f"{artifact_repo}/{image_name}:{tag}" - _run(["docker", "tag", local_image, full_image_name]) - _run(["docker", "push", full_image_name]) - - return f"{region}-docker.pkg.dev/{project_id}/{repository}/{image_name}:{tag}" +from levanter.infra import cli_helpers as cli +from levanter.infra import docker +from levanter.infra.docker import build_docker, push_to_gcp, push_to_github if __name__ == "__main__": @@ -226,27 +24,24 @@ def push_to_gcp(project_id, region, repository, image_name, tag, docker_file, ex cli.add_arg(parser, config, ["--github_user"], default=None, help="Github user name.") cli.add_arg(parser, config, ["--github_token"], default=None, help="Github token.") 
cli.add_arg(parser, config, ["--docker_file"], default="docker/tpu/Dockerfile.base", help="Dockerfile to use.") + cli.add_arg(parser, config, ["--extra_context"], required=False, default=None) # push to either github or GCP - cli.add_arg(parser, config, ["--docker_target"], choices=["github", "gcp"], required=True) + cli.add_arg(parser, config, ["--docker_target"], choices=["github", "gcp", "ghcr"], required=True) args = parser.parse_args() - if args.docker_target == "github": + with docker.copy_extra_ctx(args.extra_context) as extra_ctx: + build_args = {"EXTRA_CTX": extra_ctx} if extra_ctx else None + local_id = build_docker(docker_file=args.docker_file, image_name=args.image, tag=args.tag) + + if args.docker_target in ["github", "ghcr"]: assert args.github_user, "Must specify --github_user when pushing to Github" assert args.github_token, "Must specify --github_token when pushing to Github" - push_to_github(args.image, args.tag, args.github_user, args.github_token, docker_file=args.docker_file) + push_to_github(local_id=local_id, github_user=args.github_user, github_token=args.github_token) else: assert args.region, "Must specify --region when pushing to GCP" assert args.project, "Must specify --project when pushing to GCP" assert args.repository, "Must specify --repository when pushing to GCP" - push_to_gcp( - args.project, - args.region, - args.repository, - args.image, - args.tag, - docker_file=args.docker_file, - extra_context=Path("config"), - ) + push_to_gcp(local_id, args.project, args.region, args.repository) diff --git a/pyproject.toml b/pyproject.toml index 5cdd9718b..7ba0b4c32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,8 +6,8 @@ build-backend = "setuptools.build_meta" name = "levanter" version = "1.1" authors = [ - { name="David Hall", email="dlwh@cs.stanford.edu" }, - { name="Ivan Zhou", email="ivanz@stanford.edu" } + { name = "David Hall", email = "dlwh@cs.stanford.edu" }, + { name = "Ivan Zhou", email = "ivanz@stanford.edu" }, ] description = "Scalable Training for Foundation Models with Named Tensors and JAX" readme = "README.md" @@ -48,7 +48,7 @@ dependencies = [ "pydantic<3", "rich~=13.0", "filelock~=3.13", -# "ai2-olmo", + # "ai2-olmo", ] [project.urls] @@ -71,7 +71,14 @@ ensure_newline_before_comments = true line_length = 119 src_paths = ["src", "tests"] known_haliax = ["haliax"] -sections = ["FUTURE", "STDLIB", "THIRDPARTY", "HALIAX", "FIRSTPARTY", "LOCALFOLDER"] +sections = [ + "FUTURE", + "STDLIB", + "THIRDPARTY", + "HALIAX", + "FIRSTPARTY", + "LOCALFOLDER", +] [tool.mypy] python_version = "3.10" @@ -95,9 +102,16 @@ test = [ "soundfile", "librosa", "pytest-forked", - "pytest-asyncio" + "pytest-asyncio", ] -[tool.setuptools.packages.find] -where = ["src"] -include = ["levanter", "levanter.*"] +#[tool.setuptools.packages.find] +#where = ["src"] +#include = ["levanter", "levanter.*"] + + +[tool.setuptools] +packages = ["levanter"] + +[tool.setuptools.package-dir] +levanter = "src/levanter" diff --git a/src/levanter/__init__.py b/src/levanter/__init__.py index 093c8b545..2674d5bd6 100644 --- a/src/levanter/__init__.py +++ b/src/levanter/__init__.py @@ -11,3 +11,6 @@ import levanter.visualization as visualization from levanter.tracker import current_tracker from levanter.trainer import initialize + + +__version__ = "1.1" diff --git a/infra/__init__.py b/src/levanter/infra/__init__.py similarity index 100% rename from infra/__init__.py rename to src/levanter/infra/__init__.py diff --git a/src/levanter/infra/cli_helpers.py b/src/levanter/infra/cli_helpers.py 
new file mode 100644 index 000000000..5b1f87f01 --- /dev/null +++ b/src/levanter/infra/cli_helpers.py @@ -0,0 +1,132 @@ +import argparse +import base64 +import os +import subprocess +from typing import Optional + +import yaml +from google.cloud import storage + + +# Oddly enough, there's no API to simply fetch the current gcloud configuration... +def gcloud_config(): + client = storage.Client() + out: dict[str, str | None] = { + "project": client.project, + } + try: + out["zone"] = get_default_zone() + except subprocess.CalledProcessError: + out["zone"] = None + + return out + + +def get_default_zone() -> Optional[str]: + try: + result = subprocess.run(["gcloud", "config", "get-value", "compute/zone"], stdout=subprocess.PIPE, text=True) + return result.stdout.strip() + except subprocess.CalledProcessError: + return None + + +def add_arg(parser: argparse.ArgumentParser, config: dict, flags: list[str], required=False, default=None, **kw): + """Add an argument to the parser, using `config` or the environment to resolve default values.""" + key = flags[0].lstrip("-").replace("-", "_") + if key in config: + default = config[key] + + if key.upper() in os.environ: + default = os.environ[key.upper()] + + if default is not None: + kw["default"] = default + elif required: + kw["required"] = True + + parser.add_argument(*flags, **kw) + + +def load_config(): + if os.path.exists(".config"): + return yaml.load(open(".config", "r"), Loader=yaml.SafeLoader) + else: + return {} + + +def get_git_commit(): + """Get the current git commit hash.""" + return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip() + + +def make_docker_run_command(image_id, command, *, foreground, env): + docker_command = [ + "docker", + "run", + "-t" if foreground else "-d", + "--name=levanter", + "--privileged", + "--shm-size=32gb", + "--net=host", + "--init", + "--mount", + "type=volume,source=levanter,target=/home/levanter", + "-v", + "/tmp:/tmp", + ] + + for k, v in env.items(): + docker_command.extend(["-e", k + f"='{str(v)}'"]) + + docker_command.extend([image_id, " ".join(command)]) + return docker_command + + +def default_run_id(): + """Generate a run ID for wandb and continuation. + + Wandb expects a base36 encoded ID of exactly 8 lowercase characters + or it won't generate a display name.""" + rng_bytes = os.urandom(16) + run_id = base64.b32encode(rng_bytes)[:8].lower() + run_id = run_id.decode("utf-8") + assert len(run_id) == 8 + for char in run_id: + assert char in "abcdefghijklmnopqrstuvwxyz0123456789" + return run_id + + +def add_capacity_type_args(parser, config): + """ + Add capacity type arguments to the parser. This emulates the behavior of Google's `gcloud` CLI. + The capacity type will be stored in the `capacity_type` attribute of the parsed arguments. + + Args: + parser: The argparse parser to add arguments to. + config: The configuration dictionary to use for defaults. 
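+
+    Example (an illustrative sketch of the resulting parse):
+
+        add_capacity_type_args(parser, config)
+        args = parser.parse_args(["--spot"])
+        # args.capacity_type == "spot"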
+ + + """ + add_arg( + parser, + config, + ["--capacity_type"], + default=None, + choices=["preemptible", "spot", "reserved", "on-demand", "best-effort"], + ) + add_arg( + parser, + config, + ["--preemptible"], + required=False, + action="store_const", + const="preemptible", + dest="capacity_type", + ) + add_arg(parser, config, ["--spot"], required=False, action="store_const", const="spot", dest="capacity_type") + add_arg( + parser, config, ["--reserved"], required=False, action="store_const", const="reserved", dest="capacity_type" + ) + add_arg( + parser, config, ["--on-demand"], required=False, action="store_const", const="on-demand", dest="capacity_type" + ) diff --git a/src/levanter/infra/docker.py b/src/levanter/infra/docker.py new file mode 100644 index 000000000..2f8052f87 --- /dev/null +++ b/src/levanter/infra/docker.py @@ -0,0 +1,224 @@ +import json +import os +import pty +import shutil +import subprocess +import sys +from contextlib import contextmanager +from pathlib import Path + + +GCP_CLEANUP_POLICY = [ + { + "name": "delete-stale", + "action": {"type": "Delete"}, + "condition": { + "olderThan": "86400s", + "tagState": "ANY", + }, + }, + { + "name": "keep-latest", + "action": {"type": "Keep"}, + "mostRecentVersions": { + "keepCount": 5, + }, + }, +] + + +def _rm(path): + if path.is_dir(): + shutil.rmtree(path, ignore_errors=True) + elif path.is_file(): + os.remove(path) + elif path.exists(): + raise RuntimeError(f"Remove failed. Path ({path}) is neither a directory nor a file.") + + +def _cp(src, dst): + # delete dst if exists + _rm(dst) + + if src.is_dir(): + shutil.copytree(src, dst) + elif src.is_file(): + shutil.copy(src, dst) + else: + raise RuntimeError(f"Copy failed. Source path ({src}) is neither a directory nor a file. Check if it exists.") + + +def _run(argv): + if sys.stdout.isatty(): + output = [] + + def read(fd): + data = os.read(fd, 1024) + output.append(data) + return data + + exit_code = pty.spawn(argv, master_read=read) + if exit_code != 0: + e = subprocess.CalledProcessError(exit_code, argv) + e.output = b"".join(output) + raise e + + return b"".join(output) + else: + return subprocess.check_output(argv, stderr=subprocess.STDOUT) + + +def configure_gcp_docker(project_id, region, repository): + """Setup Artifact registry repository and configure permissions to enable TPU access.""" + # check if the repository already exists + try: + _run( + ["gcloud", "artifacts", "repositories", "describe", f"--location={region}", repository], + ) + print(f"Found existing artifact registry repository `{repository}`, skipping setup.") + return + except subprocess.CalledProcessError as e: + if b"NOT_FOUND" not in e.output: + raise + + # Activate artifact registry and setup the repository. + _run(["gcloud", "services", "enable", "artifactregistry.googleapis.com"]) + + try: + _run( + [ + "gcloud", + "artifacts", + "repositories", + "create", + repository, + f"--location={region}", + "--repository-format=docker", + ], + ) + except subprocess.CalledProcessError as e: + # Ignore error if repository already exists. 
+ if b"ALREADY_EXISTS" not in e.output: + print("Error creating repository: ", e.output) + raise + + with open("/tmp/cleanup-policy.json", "w") as f: + json.dump(GCP_CLEANUP_POLICY, f, indent=2) + + _run( + [ + "gcloud", + "artifacts", + "repositories", + "set-cleanup-policies", + f"--location={region}", + "--policy=/tmp/cleanup-policy.json", + repository, + ] + ) + + # Grant public read access ('allUsers') for TPU VMs + _run( + [ + "gcloud", + "artifacts", + "repositories", + "add-iam-policy-binding", + "--member=allUsers", + "--role=roles/artifactregistry.reader", + f"--location={region}", + repository, + ] + ) + + _run( + [ + "gcloud", + "--project", + project_id, + "artifacts", + "repositories", + "add-iam-policy-binding", + repository, + "--location", + region, + "--member", + "allUsers", + "--role", + "roles/artifactregistry.reader", + ] + ) + + _run(["gcloud", "auth", "configure-docker", "--quiet", f"{region}-docker.pkg.dev"]) + + +@contextmanager +def copy_extra_ctx(extra_ctx): + """Context manager to handle copying and cleanup of extra context directory.""" + if extra_ctx is not None: + mount_dst = Path(".mnt") + _cp(extra_ctx, mount_dst) + try: + yield mount_dst + finally: + _rm(mount_dst) + else: + yield None + + +def build_docker(docker_file, image_name, tag, build_args=None) -> str: + """Builds a Docker image, enables artifact access, and pushes to Artifact Registry.""" + args = [ + "docker", + "buildx", + "build", + "--platform=linux/amd64", + # "--progress=plain", + "-t", + f"{image_name}:{tag}", + ] + + if build_args: + for key, value in build_args.items(): + args.extend(["--build-arg", f"{key}={value}"]) + + args.extend( + [ + "-f", + docker_file, + ".", + ] + ) + _run(args) + + return f"{image_name}:{tag}" + + +def push_to_github(local_id, github_user, github_token=None): + """Pushes a local Docker image to Docker Hub.""" + + # Authenticate the docker service with Github if a token exists + if github_token: + login_process = subprocess.Popen( + ["docker", "login", "ghcr.io", "-u", github_user, "--password-stdin"], stdin=subprocess.PIPE + ) + print(login_process.communicate(input=github_token.encode(), timeout=10)) + + remote_name = f"ghcr.io/{github_user}/{local_id}" + + _run(["docker", "tag", local_id, remote_name]) + _run(["docker", "push", remote_name]) + return remote_name + + +def push_to_gcp(local_id, project_id, region, repository) -> str: + """Pushes a local Docker image to Artifact Registry.""" + configure_gcp_docker(project_id, region, repository) + + artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}" + + full_image_name = f"{artifact_repo}/{local_id}" + _run(["docker", "tag", local_id, full_image_name]) + _run(["docker", "push", full_image_name]) + + return f"{artifact_repo}/{local_id}" diff --git a/src/levanter/infra/tpus.py b/src/levanter/infra/tpus.py new file mode 100644 index 000000000..580f69a6b --- /dev/null +++ b/src/levanter/infra/tpus.py @@ -0,0 +1,255 @@ +import concurrent.futures +import getpass +import json +import os +import subprocess +import sys +import time +from typing import Optional + +from levanter.infra.cli_helpers import make_docker_run_command + + +def setup_vm_docker(tpu_name, zone, node_count): + """Change docker permissions on `tpu_name`, remove any old runs, and setup the cache volume.""" + tpu_ssh( + tpu_name, + zone, + node_count, + "sudo", + "usermod", + "-aG", + "docker", + getpass.getuser(), + "&&", + "sudo", + "docker", + "volume", + "create", + "--driver=local", + "levanter", + "&&", + "sudo", + "docker", + 
"rm", + "-f", + "levanter", + ) + + +def list_tpus(zone): + return json.loads( + subprocess.check_output( + [ + "gcloud", + "alpha", + "compute", + "tpus", + "queued-resources", + "list", + f"--zone={zone}", + "--format=json(name.basename(), state)", + ] + ) + ) + + +def describe_tpu(tpu_name, zone): + try: + return json.loads( + subprocess.check_output( + [ + "gcloud", + "alpha", + "compute", + "tpus", + "queued-resources", + "describe", + tpu_name, + f"--zone={zone}", + "--format=json(name.basename(), state)", + ] + ) + ) + except subprocess.CalledProcessError: + return None + + +def start_tpu_vm(tpu_name, *, tpu_type, capacity_type, version, zone, node_count): + tpu_stat = describe_tpu(tpu_name, zone) + if tpu_stat is not None: + if tpu_stat["state"]["state"] in ["FAILED", "SUSPENDED"]: + print("TPU suspended, deleting...", file=sys.stderr) + + run_command( + "gcloud", + "alpha", + "compute", + "tpus", + "queued-resources", + "delete", + tpu_name, + "--quiet", + f"--zone={zone}", + "--force", + ) + else: + print(f"TPU {tpu_name} already exists and is in state {tpu_stat['state']['state']}.", file=sys.stderr) + return + + print(f"Creating new TPU {tpu_name} in {zone} of type {tpu_type}...", file=sys.stderr) + command = [ + "gcloud", + "alpha", + "compute", + "tpus", + "queued-resources", + "create", + tpu_name, + f"--accelerator-type={tpu_type}", + f"--runtime-version={version}", + f"--zone={zone}", + "--quiet", + ] + if capacity_type in ["preemptible", "best-effort"]: + command.append("--best-effort") + elif capacity_type == "reserved": + command.append("--reserved") + elif capacity_type == "spot": + command.append("--spot") + elif capacity_type == "on-demand" or capacity_type is None: + pass + else: + raise ValueError(f"Unknown capacity type: {capacity_type}") + + if node_count == 1: + command.append(f"--node-id={tpu_name}") + else: + command.append(f"--node-count={node_count}") + + run_command(*command) + + # wait for queued resource to complete + print("Checking TPU creation status every minute...") + waited = 0 + while True: + time.sleep(60) + waited += 1 + + tpu_stat = describe_tpu(tpu_name, zone) + assert tpu_stat is not None, f"{tpu_name} creation failed." + + match tpu_stat["state"]["state"]: + case "ACTIVE": + break + case "FAILED": + raise RuntimeError( + f"{tpu_name} creation failed: {tpu_stat['state']['failedData']['error']['message']}" + ) + case _: + print(f"Status is {tpu_stat['state']['state']}. Waited {waited} minutes...") + + +def launch_job( + command: list[str], + tpu_name: str, + tpu_type: str, + capacity_type: str, + zone: str, + node_count: int, + full_image_id: str, + env: dict[str, str], + foreground: bool, + version: Optional[str] = None, +): + start_tpu_vm( + tpu_name=tpu_name, + tpu_type=tpu_type, + capacity_type=capacity_type, + version=version, + zone=zone, + node_count=node_count, + ) + + # We don't technically need to setup on every run, but if we are working on a + # stale VM or a VM from e.g. spin-up-vm.sh, this ensures things always work. + setup_vm_docker( + tpu_name=tpu_name, + zone=zone, + node_count=node_count, + ) + + docker_command = make_docker_run_command(full_image_id, command, env=env, foreground=foreground) + + print(f"Running on tpu_name... {tpu_name}") + tpu_ssh(tpu_name, zone, node_count, *docker_command) + + +def run_command(*args, **kwargs): + print("Running:", " ".join(list(args))) + return subprocess.check_call(args, **kwargs) + + +def add_ssh_key(ssh_key_filename): + # format 3072 SHA256:... 
key-name (RSA) + key_hash = subprocess.check_output(["ssh-keygen", "-lf", ssh_key_filename]).decode("utf-8").split()[1] + existing_keys = subprocess.check_output(["ssh-add", "-l"]).decode("utf-8").split("\n") + for key in existing_keys: + if key_hash in key: + return + + subprocess.check_call(["ssh-add", ssh_key_filename]) + + +def tpu_ssh(tpu_name, zone, node_count, *args, ignore_failure=False): + add_ssh_key(os.path.expanduser("~/.ssh/google_compute_engine")) + try: + if node_count > 1: + return _tpu_ssh_multislice(tpu_name, zone, node_count, *args, ignore_failure=ignore_failure) + + return run_command( + "gcloud", + "alpha", + "compute", + "tpus", + "tpu-vm", + "ssh", + tpu_name, + "--worker=all", + f"--zone={zone}", + "--command=%s" % " ".join(args), + ) + except subprocess.CalledProcessError as e: + if ignore_failure: + print("Ignoring failure:", e) + else: + raise + + +def _tpu_ssh_multislice(tpu_name, zone, node_count, *args, ignore_failure=False): + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = [ + executor.submit( + run_command, + "gcloud", + "alpha", + "compute", + "tpus", + "tpu-vm", + "ssh", + f"{tpu_name}-{i}", + "--worker=all", + f"--zone={zone}", + "--command=%s" % " ".join(args), + ) + for i in range(node_count) + ] + + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except subprocess.CalledProcessError as e: + if ignore_failure: + print("Ignoring failure:", e) + else: + raise From 5c53a19fdcceb856325664b74bd795cc8bfc8d86 Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Mon, 26 Aug 2024 14:46:59 -0700 Subject: [PATCH 33/94] Llama mixture (#706) * llama style mixture * skiping data loader seek --- src/levanter/main/train_lm.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/levanter/main/train_lm.py b/src/levanter/main/train_lm.py index 385b6fc2b..099fe2eb6 100644 --- a/src/levanter/main/train_lm.py +++ b/src/levanter/main/train_lm.py @@ -14,6 +14,7 @@ import levanter from levanter import callbacks +from levanter.checkpoint import load_checkpoint from levanter.compat.hf_checkpoints import HFCompatConfig, save_hf_checkpoint_callback from levanter.data.text import CausalLmDataset, LMDatasetConfig, LMMixtureDatasetConfig from levanter.models.gpt2 import Gpt2Config @@ -50,6 +51,8 @@ class TrainLmConfig: update_hessian_steps: int = 10 data_seed: Optional[int] = None # if provided, will override the data seed from the trainer + initialize_from_checkpoint_path: Optional[str] = None + # if provided, will initialize from this checkpoint, used for llama style data mixture def main(config: TrainLmConfig): @@ -126,6 +129,11 @@ def main(config: TrainLmConfig): state = trainer.initial_state(training_key, model_init=lambda: config.model.build(Vocab, key=model_key)) + seek_dataloader = True + if int(state.step) == 0 and config.initialize_from_checkpoint_path is not None: + state = load_checkpoint(state, config.initialize_from_checkpoint_path) + seek_dataloader = False + if int(state.step) == 0: # TODO: I don't love that we init the model twice, but it's not a big deal i think? if config.initialize_from_hf: @@ -207,7 +215,7 @@ def compute_log_probs(model, example): # data loader. 
may need to seek to the right place if we're resuming train_loader = iter(trainer.sharded_loader(train_dataset, Batch)) - if int(state.step) > 0: + if int(state.step) > 0 and seek_dataloader: # step is after the batch, so we need to seek to step # TODO: implement iter_data.seek(resume_step +1) import tqdm From 277e7287939bce40e1d5d334602c0e1f88cccac3 Mon Sep 17 00:00:00 2001 From: David Hall Date: Mon, 26 Aug 2024 14:51:41 -0700 Subject: [PATCH 34/94] Fix base again (#707) * fix incremental, add default version * maybe i should stop messing around with docker --- docker/tpu/Dockerfile.base | 10 ++++++---- docker/tpu/Dockerfile.incremental | 1 + src/levanter/infra/tpus.py | 5 ++++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/docker/tpu/Dockerfile.base b/docker/tpu/Dockerfile.base index 9e078c07c..3c0d1cc5e 100644 --- a/docker/tpu/Dockerfile.base +++ b/docker/tpu/Dockerfile.base @@ -5,12 +5,14 @@ RUN pip install virtualenv # venv binaries encode their directory, so we need to setup the venv in the final location RUN virtualenv -p python3.10 /opt/levanter/.venv ENV PATH /opt/levanter/.venv/bin:$PATH -RUN /opt/levanter/.venv/bin/pip install -U uv "jax[tpu]==0.4.30" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html +RUN /opt/levanter/.venv/bin/pip install -U "jax[tpu]==0.4.30" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html # Install package dependencies to make incremental builds faster. -WORKDIR /opt/levanter -ADD pyproject.toml README.md /opt/levanter/ -RUN uv sync --no-install-project +WORKDIR /tmp/ +ADD pyproject.toml README.md /tmp/ +# work around setuptools bug +RUN mkdir -p /tmp/src/levanter +RUN pip install .[test] FROM python:3.10 diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental index 4b0ddb608..f0369736c 100644 --- a/docker/tpu/Dockerfile.incremental +++ b/docker/tpu/Dockerfile.incremental @@ -16,6 +16,7 @@ ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\ WORKDIR /opt/levanter ADD pyproject.toml README.md /opt/levanter/ +RUN mkdir -p /opt/levanter/src/levanter RUN pip install -e '.[test]' ADD . /opt/levanter diff --git a/src/levanter/infra/tpus.py b/src/levanter/infra/tpus.py index 580f69a6b..884d91faf 100644 --- a/src/levanter/infra/tpus.py +++ b/src/levanter/infra/tpus.py @@ -76,6 +76,8 @@ def describe_tpu(tpu_name, zone): def start_tpu_vm(tpu_name, *, tpu_type, capacity_type, version, zone, node_count): + if version is None: + version = "tpu-ubuntu2204-base" tpu_stat = describe_tpu(tpu_name, zone) if tpu_stat is not None: if tpu_stat["state"]["state"] in ["FAILED", "SUSPENDED"]: @@ -107,10 +109,11 @@ def start_tpu_vm(tpu_name, *, tpu_type, capacity_type, version, zone, node_count "create", tpu_name, f"--accelerator-type={tpu_type}", - f"--runtime-version={version}", f"--zone={zone}", "--quiet", ] + if version is not None: + command.append(f"--runtime-version={version}") if capacity_type in ["preemptible", "best-effort"]: command.append("--best-effort") elif capacity_type == "reserved": From 0c628d5af703094571aef1eaf54528d4757a694d Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 27 Aug 2024 16:28:26 -0700 Subject: [PATCH 35/94] Fix tpu vm autoshutdown (#708) * fix autodeletion of TPU nodes? * actually fix autodeletion of TPU nodes? 
* wip * fork fixes it * sigh --- src/levanter/models/backpack.py | 9 ++-- src/levanter/models/whisper.py | 4 +- src/levanter/utils/cloud_utils.py | 81 ++++++++++++++++++++++++++++--- tests/test_doremi.py | 1 + 4 files changed, 83 insertions(+), 12 deletions(-) diff --git a/src/levanter/models/backpack.py b/src/levanter/models/backpack.py index aa6e4b7d6..2a955395f 100644 --- a/src/levanter/models/backpack.py +++ b/src/levanter/models/backpack.py @@ -116,8 +116,8 @@ def init( use_bias: bool = True, ) -> "BackpackMlp": k_fc, k_proj = jrandom.split(key, 2) - c_fc = hnn.Linear.init(Out=Mlp, In=Embed, key=k_fc, use_bias=use_bias) - c_proj = hnn.Linear.init(Out=Out, In=Mlp, key=k_proj, use_bias=use_bias) + c_fc = hnn.Linear.init(Out=Mlp, In=Embed, key=k_fc, use_bias=use_bias, out_first=False) + c_proj = hnn.Linear.init(Out=Out, In=Mlp, key=k_proj, use_bias=use_bias, out_first=False) if isinstance(activation_fn, str): activation_fn = ACT2FN[activation_fn] act = activation_fn # type: ignore @@ -176,7 +176,10 @@ def init(config: Gpt2Config, *, key) -> "WeightsOnlyAttention": Embed = config.Embed k_c, _ = jrandom.split(key, 2) - c_attn = hnn.Linear.init(In=Embed, Out=(Qk, config.Senses, config.SenseHeadDim), key=k_c, use_bias=use_bias) + # NB: out_first=True b/c the torch implementation uses Linear + c_attn = hnn.Linear.init( + In=Embed, Out=(Qk, config.Senses, config.SenseHeadDim), key=k_c, use_bias=use_bias, out_first=True + ) dropout = hnn.Dropout(config.attn_pdrop) return WeightsOnlyAttention(config, c_attn, dropout) diff --git a/src/levanter/models/whisper.py b/src/levanter/models/whisper.py index 9116851d2..ad1db0ab6 100644 --- a/src/levanter/models/whisper.py +++ b/src/levanter/models/whisper.py @@ -136,8 +136,8 @@ class WhisperMlp(eqx.Module, StateDictSerializationMixin): @staticmethod def init(Embed: Axis, Mlp: Axis, activation_fn, *, key, use_bias: bool = True) -> "WhisperMlp": k_fc, k_proj = haliax.jax_utils.maybe_rng_split(key, 2) - fc1 = hnn.Linear.init(Out=Mlp, In=Embed, key=k_fc, use_bias=use_bias) - fc2 = hnn.Linear.init(Out=Embed, In=Mlp, key=k_proj, use_bias=use_bias) + fc1 = hnn.Linear.init(Out=Mlp, In=Embed, key=k_fc, use_bias=use_bias, out_first=False) + fc2 = hnn.Linear.init(Out=Embed, In=Mlp, key=k_proj, use_bias=use_bias, out_first=False) if isinstance(activation_fn, str): activation_fn = ACT2FN[activation_fn] act = activation_fn # type: ignore diff --git a/src/levanter/utils/cloud_utils.py b/src/levanter/utils/cloud_utils.py index 6e9402b9e..8c6a1e884 100644 --- a/src/levanter/utils/cloud_utils.py +++ b/src/levanter/utils/cloud_utils.py @@ -1,4 +1,5 @@ import contextlib +import json import logging import os import shutil @@ -29,33 +30,99 @@ def _checked_request(url): raise +def _checked_delete(url): + # first get the token + token = _checked_request( + "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/token" + ) + token = json.loads(token)["access_token"] + headers = {"Authorization": f"Bearer {token}", "Metadata-Flavor": "Google"} + try: + response = requests.delete(url, headers=headers) + response.raise_for_status() + return response.text + except requests.exceptions.RequestException: + logger.exception(f"Could not delete {url} from metadata server. 
Is this a TPU VM?", exc_info=True) + raise + + +def _shutdown_tpu_with_queued_resource(): + queued_resource = _checked_request( + "http://metadata.google.internal/computeMetadata/v1/instance/attributes/queued-resource-name" + ) + # queued resource looks like: + # projects/999999/locations/us-central2-b/queuedResources/NAME + # to delete we need to use delete against + # https://tpu.googleapis.com/v2/projects/9999/locations/us-central2-b/queuedResources/NAME?force=true + if queued_resource: + queued_resource_name = queued_resource.split("/")[-1] + # quiet really works like -y + if jax.process_index() == 0: + logger.critical(f"Found queued resource {queued_resource_name}. Attempting to delete it.") + # We need to use curl + # curl -X DELETE -H "Authorization: Bearer $(gcloud auth print-access-token)" \ + # -H "Content-Type: application/json" \ + # https://tpu.googleapis.com/v2/projects/my-project/locations/us-central2-b/queuedResources/my-queued-resource?force=true + # os.system(f"gcloud compute tpus queued-resources delete {queued_resource} --zone {zone} --force --quiet") + url = f"https://tpu.googleapis.com/v2/{queued_resource}?force=true" + _checked_delete(url) + return True + else: + logger.info("No queued resource found.") + return False + + def shutdown_tpu_vm(sleep_seconds=60 * 5): """You should probably call this from atexit or something like that.""" + # fork a process to do the delete so the main process can exit before the delete is done + logger.info("Forking a process to delete...") + logger.critical(f"Create a file {SENTINEL_FILE} to cancel the shutdown") + logger.critical(f"$ touch {SENTINEL_FILE}") + + # fork works better for our use case + pid = os.fork() + if pid == 0: + _do_shutdown_tpu_vm(sleep_seconds) + os._exit(0) + else: + logger.info(f"Forked process {pid} to delete TPU VM") + + +def _do_shutdown_tpu_vm(sleep_seconds): + # the gcloud command we would run is something like: + # gcloud compute tpus tpu-vm delete tpu-vm-1 --zone us-central1-a --quiet try: zone = _checked_request("http://metadata.google.internal/computeMetadata/v1/instance/zone") zone = zone.split("/")[-1] - name = _checked_request("http://metadata.google.internal/computeMetadata/v1/attributes/instance-id") + name = _checked_request("http://metadata.google.internal/computeMetadata/v1/instance/attributes/instance-id") + project = _checked_request("http://metadata.google.internal/computeMetadata/v1/project/project-id") except requests.exceptions.RequestException: logger.warning("Could not get zone or instance-id from metadata server. Is this a TPU VM? Not shutting down.") return - # the gcloud command we would run is something like: - # gcloud compute tpus tpu-vm delete tpu-vm-1 --zone us-central1-a --quiet logger.critical(f"Shutting down TPU VM {name} in zone {zone} in {sleep_seconds} seconds") - logger.critical(f"Create a file {SENTINEL_FILE} to cancel the shutdown") - logger.critical(f"$ touch {SENTINEL_FILE}") - time.sleep(sleep_seconds) if os.path.exists(SENTINEL_FILE): logger.critical(f"Found sentinel file {SENTINEL_FILE}, not shutting down TPU VM") return + logger.critical(f"Shutting down TPU VM {name} in zone {zone}") + + try: + success = _shutdown_tpu_with_queued_resource() + if success: + return + except requests.exceptions.RequestException: + logger.info("This is not a queued resource, deleting the old fashioned way.") logger.critical(f"Shutting down TPU VM {name} in zone {zone}") if jax.process_index() != 0: logger.info(f"Letting process 0 handle the shutdown. 
We are process {jax.process_index()}") return - os.system(f"gcloud compute tpus tpu-vm delete {name} --zone {zone} --quiet") + # os.system(f"gcloud compute tpus tpu-vm delete {name} --zone {zone} --quiet") + # https://tpu.googleapis.com/v2/projects/PROJECT/locations/us-central2-b/nodes/NAME + url = f"http://tpu.googleapis.com/v2/projects/{project}/locations/{zone}/nodes/{name}" + _checked_delete(url) _sync_count = 0 diff --git a/tests/test_doremi.py b/tests/test_doremi.py index 33b15f0ab..8f10139b0 100644 --- a/tests/test_doremi.py +++ b/tests/test_doremi.py @@ -113,6 +113,7 @@ def init_model(): (), use_bias=True, key=model_key, + out_first=True, ) m1, loss1 = fit_to_dataset(ds1) From 97358f9bde968d70ccd7075b848c6c6169044462 Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 27 Aug 2024 23:33:24 -0700 Subject: [PATCH 36/94] suppress stderr in describe_tpu since it usually logs a dumb error (#710) --- src/levanter/infra/tpus.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/levanter/infra/tpus.py b/src/levanter/infra/tpus.py index 884d91faf..7e630f069 100644 --- a/src/levanter/infra/tpus.py +++ b/src/levanter/infra/tpus.py @@ -68,7 +68,8 @@ def describe_tpu(tpu_name, zone): tpu_name, f"--zone={zone}", "--format=json(name.basename(), state)", - ] + ], + stderr=subprocess.DEVNULL, ) ) except subprocess.CalledProcessError: From d674dd93a415a41f55ff047eec7d436657717a54 Mon Sep 17 00:00:00 2001 From: David Hall Date: Wed, 28 Aug 2024 17:34:34 -0700 Subject: [PATCH 37/94] wip --- config/data/dclm_llama3.yaml | 75 +++++++++++++++++++++++++++++++++ config/llama3_8b_with_dclm.yaml | 33 +++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 config/data/dclm_llama3.yaml create mode 100644 config/llama3_8b_with_dclm.yaml diff --git a/config/data/dclm_llama3.yaml b/config/data/dclm_llama3.yaml new file mode 100644 index 000000000..45d24900d --- /dev/null +++ b/config/data/dclm_llama3.yaml @@ -0,0 +1,75 @@ +cache_dir: "gs://marin-us-central2/tokenized/llama3" +tokenizer: "meta-llama/Meta-Llama-3.1-8B" +stop_strategy: restart +shuffle_buffer_size: 100000 +configs: + "dclm": + train_urls: + - gs://marin-data/datacomp/dclm-baseline-dedup-07-09/*/*/*.jsonl.zstd + # these are just for eval + "paloma/4chan": + validation_urls: + - gs://levanter-data/paloma/4chan_meta_sep/val/val*.jsonl.gz + "paloma/c4_100_domains": + validation_urls: + - gs://levanter-data/paloma/c4_100_domains/val/val*.jsonl.gz + "paloma/c4_en": + validation_urls: + - gs://levanter-data/paloma/c4_en/val/val*.jsonl.gz + "paloma/dolma-v1_5": + validation_urls: + - gs://levanter-data/paloma/dolma-v1_5/val/val*.jsonl.gz + "paloma/dolma_100_programing_languages": + validation_urls: + - gs://levanter-data/paloma/dolma_100_programing_languages/val/val*.jsonl.gz + "paloma/dolma_100_subreddits": + validation_urls: + - gs://levanter-data/paloma/dolma_100_subreddits/val/val*.jsonl.gz + "paloma/falcon-refinedweb": + validation_urls: + - gs://levanter-data/paloma/falcon-refinedweb/val/val*.jsonl.gz + "paloma/gab": + validation_urls: + - gs://levanter-data/paloma/gab/val/val*.jsonl.gz + "paloma/m2d2_s2orc_unsplit": + validation_urls: + - gs://levanter-data/paloma/m2d2_s2orc_unsplit/val/val*.jsonl.gz + "paloma/m2d2_wikipedia_unsplit": + validation_urls: + - gs://levanter-data/paloma/m2d2_wikipedia_unsplit/val/val*.jsonl.gz + "paloma/manosphere_meta_sep": + validation_urls: + - gs://levanter-data/paloma/manosphere_meta_sep/val/val*.jsonl.gz + "paloma/mc4": + validation_urls: + - 
gs://levanter-data/paloma/mc4/val/val*.jsonl.gz + "paloma/ptb": + validation_urls: + - gs://levanter-data/paloma/ptb/val/val*.jsonl.gz + "paloma/redpajama": + validation_urls: + - gs://levanter-data/paloma/redpajama/val/val*.jsonl.gz + "paloma/twitterAAE_HELM_fixed": + validation_urls: + - gs://levanter-data/paloma/twitterAAE_HELM_fixed/val/val*.jsonl.gz + "paloma/wikitext_103": + validation_urls: + - gs://levanter-data/paloma/wikitext_103/val/val*.jsonl.gz +train_weights: + dclm: 1.0 + paloma/4chan: 0.0 + paloma/c4_100_domains: 0.0 + paloma/c4_en: 0.0 + paloma/dolma-v1_5: 0.0 + paloma/dolma_100_programing_languages: 0.0 + paloma/dolma_100_subreddits: 0.0 + paloma/falcon-refinedweb: 0.0 + paloma/gab: 0.0 + paloma/m2d2_s2orc_unsplit: 0.0 + paloma/m2d2_wikipedia_unsplit: 0.0 + paloma/manosphere_meta_sep: 0.0 + paloma/mc4: 0.0 + paloma/ptb: 0.0 + paloma/redpajama: 0.0 + paloma/twitterAAE_HELM_fixed: 0.0 + paloma/wikitext_103: 0.0 diff --git a/config/llama3_8b_with_dclm.yaml b/config/llama3_8b_with_dclm.yaml new file mode 100644 index 000000000..b55dec3e5 --- /dev/null +++ b/config/llama3_8b_with_dclm.yaml @@ -0,0 +1,33 @@ +data: !include data/dclm_llama3.yaml +model: # 8B model with Llama 3 tokenizer + type: llama + seq_len: 2048 + hidden_dim: 4096 + intermediate_dim: 14336 + num_layers: 32 + num_heads: 32 + num_kv_heads: 8 + use_flash_attention: True +trainer: + tracker: + type: wandb + entity: "stanford-mercury" + project: "marin" + tags: ["dclm", "7B", "llama"] + + mp: p=f32,c=bfloat16 + train_batch_size: 2048 + num_train_steps: 69000 # 276e9 / 4M + steps_per_eval: 1000 + tensor_parallel_axes: ["mlp", "heads"] + fsdp_axis: "embed" + batch_axis: "batch" +optimizer: + learning_rate: 4e-4 + weight_decay: 0.1 + min_lr_ratio: 0.1 + beta1: 0.9 + beta2: 0.95 + warmup: 5000 + +z_loss_weight: 5e-6 From 4913df2e84ec53af7d957bb398a8a373aeda2521 Mon Sep 17 00:00:00 2001 From: David Hall Date: Wed, 28 Aug 2024 21:18:18 -0700 Subject: [PATCH 38/94] fix pyprojec.toml and pre-commit wandb issues (#712) --- .pre-commit-config.yaml | 2 +- pyproject.toml | 15 ++++----------- src/levanter/tracker/wandb.py | 6 +++--- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 394f049f6..a2b97cc88 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,4 +38,4 @@ repos: hooks: - id: mypy args: [--ignore-missing-imports] - additional_dependencies: [wandb, types-PyYAML] + additional_dependencies: [wandb==0.17.8, types-PyYAML] diff --git a/pyproject.toml b/pyproject.toml index 7ba0b4c32..f936e5afe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ dependencies = [ "tokenizers>=0.15.2", "transformers>=4.41.2", "optax>=0.1.9", - "wandb>=0.16.6,<0.17.6", + "wandb>=0.17.8", "scipy<=1.12.0", "draccus>=0.8.0", "pyarrow>=11.0.0", @@ -105,13 +105,6 @@ test = [ "pytest-asyncio", ] -#[tool.setuptools.packages.find] -#where = ["src"] -#include = ["levanter", "levanter.*"] - - -[tool.setuptools] -packages = ["levanter"] - -[tool.setuptools.package-dir] -levanter = "src/levanter" +[tool.setuptools.packages.find] +where = ["src"] +include = ["levanter", "levanter.*"] diff --git a/src/levanter/tracker/wandb.py b/src/levanter/tracker/wandb.py index c98c0727c..1b0254261 100644 --- a/src/levanter/tracker/wandb.py +++ b/src/levanter/tracker/wandb.py @@ -182,9 +182,9 @@ def init(self, run_id: Optional[str]) -> WandbTracker: if wandb.run is not None: wandb.run.log_artifact(str(requirements_path), name="requirements.txt", type="requirements") - 
wandb.summary["num_devices"] = jax.device_count() - wandb.summary["num_hosts"] = jax.process_count() - wandb.summary["backend"] = jax.default_backend() + wandb.summary["num_devices"] = jax.device_count() # type: ignore + wandb.summary["num_hosts"] = jax.process_count() # type: ignore + wandb.summary["backend"] = jax.default_backend() # type: ignore return WandbTracker(r) From 06dc3041add1874a8cdf8d98a0bbd0a60f28b723 Mon Sep 17 00:00:00 2001 From: David Hall Date: Wed, 28 Aug 2024 23:58:39 -0700 Subject: [PATCH 39/94] wip --- config/llama_22b_with_dclm.yaml | 33 ++++++++++++++++++++++++++++++++ config/llama_7b_with_dclm.yaml | 2 +- src/levanter/utils/flop_utils.py | 2 +- 3 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 config/llama_22b_with_dclm.yaml diff --git a/config/llama_22b_with_dclm.yaml b/config/llama_22b_with_dclm.yaml new file mode 100644 index 000000000..7dec40026 --- /dev/null +++ b/config/llama_22b_with_dclm.yaml @@ -0,0 +1,33 @@ +data: !include data/dclm_gpt_neo.yaml +model: # 22B class model + type: llama + seq_len: 2048 + hidden_dim: 6144 + intermediate_dim: 16384 + num_layers: 56 + num_heads: 48 + num_kv_heads: 16 + use_flash_attention: True +trainer: + tracker: + type: wandb + entity: "stanford-mercury" + project: "marin" + tags: ["dclm", "22B", "llama"] + + mp: p=f32,c=bfloat16 + train_batch_size: 2048 + num_train_steps: 100000 + steps_per_eval: 1000 + tensor_parallel_axes: ["mlp", "heads"] + fsdp_axis: "embed" + batch_axis: "batch" +optimizer: + learning_rate: 1e-4 + weight_decay: 0.1 + min_lr_ratio: 0.1 + beta1: 0.9 + beta2: 0.95 + warmup: 5000 + +z_loss_weight: 5e-6 diff --git a/config/llama_7b_with_dclm.yaml b/config/llama_7b_with_dclm.yaml index 9d30e9917..11a182f09 100644 --- a/config/llama_7b_with_dclm.yaml +++ b/config/llama_7b_with_dclm.yaml @@ -17,7 +17,7 @@ trainer: mp: p=f32,c=bfloat16 train_batch_size: 2048 - num_train_steps: 69000 # 276e9 / 4M + num_train_steps: 480000 # 2T / 4M steps_per_eval: 1000 tensor_parallel_axes: ["mlp", "heads"] fsdp_axis: "embed" diff --git a/src/levanter/utils/flop_utils.py b/src/levanter/utils/flop_utils.py index 27278ca08..7d5c4fc0a 100644 --- a/src/levanter/utils/flop_utils.py +++ b/src/levanter/utils/flop_utils.py @@ -138,7 +138,7 @@ def lm_flops_per_token( "int8": 275e12, }, # Source: https://cloud.google.com/tpu/docs/v5e - "TPU v5 lite": { + "tpu v5 lite": { "bf16": 197e12, "int8": 393e12, }, From ffa8e28aad5ab159a8fdea0e3a94380300e97833 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 29 Aug 2024 00:11:20 -0700 Subject: [PATCH 40/94] fix device kind for mfu v5e (#713) --- src/levanter/utils/flop_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/levanter/utils/flop_utils.py b/src/levanter/utils/flop_utils.py index 27278ca08..7d5c4fc0a 100644 --- a/src/levanter/utils/flop_utils.py +++ b/src/levanter/utils/flop_utils.py @@ -138,7 +138,7 @@ def lm_flops_per_token( "int8": 275e12, }, # Source: https://cloud.google.com/tpu/docs/v5e - "TPU v5 lite": { + "tpu v5 lite": { "bf16": 197e12, "int8": 393e12, }, From fd7888da26c556079d34b8d156b93b447ca31c1e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Sun, 1 Sep 2024 23:19:56 -0700 Subject: [PATCH 41/94] add haps configuration (cycle lr schedule) (#709) * add haps configuration (cycle lr schedule) * tiny fixes --- src/levanter/optim/config.py | 64 +++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/src/levanter/optim/config.py b/src/levanter/optim/config.py index 6d61159bd..c6b3bd783 
100644 --- a/src/levanter/optim/config.py +++ b/src/levanter/optim/config.py @@ -24,7 +24,7 @@ class OptimizerConfig(draccus.ChoiceRegistry, abc.ABC): min_lr_ratio: float = 0.1 warmup_ratio: Optional[float] = None # Deprecated. fraction of training steps to use as warmup - """The lr scheduler operates on 4 stages: [warmup] - [stable] - [decay] - [cooldown]""" + """The lr scheduler operates on 4 stages: [warmup] - {[stable] - [decay]} x haps - [cooldown]""" warmup: float = 0.01 """fraction of training steps to use as warmup, or steps to use. 0.0 means no warmup""" stable: float = 0.00 @@ -32,6 +32,8 @@ class OptimizerConfig(draccus.ChoiceRegistry, abc.ABC): cooldown: float = 0.0 """fraction of training steps to use as cooldown, or steps to use. 0.0 means no cooldown""" lr_schedule: str = "cosine" # constant, cosine, linear + haps: Optional[list[int]] = None + """list of integers indicating pit stop steps. See paper https://openreview.net/pdf?id=RSsavSvAvN""" weight_decay_modules: Optional[list[str] | str] = None """A regex or a list of strings to identify where to mask weight. For nano-GPT, this field can be set as `r".*attn.*weight|.*mlp.*weight|.*token_embeddings|.*position_embeddings"`""" @@ -138,22 +140,13 @@ def mask_fn(model): def lr_scheduler(self, num_train_steps): warmup_steps = self._convert_warmup(num_train_steps) - stable_steps = _convert_ratio_or_steps(self.stable, num_train_steps) cooldown_steps = _convert_ratio_or_steps(self.cooldown, num_train_steps) - lr_decay_steps = num_train_steps - warmup_steps - stable_steps - cooldown_steps - min_lr = self.learning_rate * self.min_lr_ratio + if self.haps is None: + self.haps = [] + self.haps.insert(0, warmup_steps) + self.haps.append(num_train_steps - cooldown_steps) - match self.lr_schedule: - case "constant": - schedule = optax.constant_schedule(self.learning_rate) - case "cosine": - schedule = optax.cosine_decay_schedule(self.learning_rate, lr_decay_steps, self.min_lr_ratio) - case "linear": - schedule = optax.linear_schedule(self.learning_rate, min_lr, lr_decay_steps) - case "inv_sqrt": - schedule = _inv_sqrt_decay_schedule(self.learning_rate, min_lr, warmup_steps, 10000) - case _: - raise ValueError(f"Unknown lr_schedule: {self.lr_schedule}") + min_lr = self.learning_rate * self.min_lr_ratio schedules = [] boundaries = [] @@ -163,18 +156,37 @@ def lr_scheduler(self, num_train_steps): schedules.append(warmup) boundaries.append(warmup_steps) - if stable_steps != 0: - stable = optax.constant_schedule(self.learning_rate) - schedules.append(stable) - boundaries.append(warmup_steps + stable_steps) - - schedules.append(schedule) + for start, end in zip(self.haps[:-1], self.haps[1:]): + cycle_steps = end - start + stable_steps = _convert_ratio_or_steps(self.stable, cycle_steps) + lr_decay_steps = cycle_steps - stable_steps + + if stable_steps != 0: + stable = optax.constant_schedule(self.learning_rate) + schedules.append(stable) + boundaries.append(start + stable_steps) + + match self.lr_schedule: + case "constant": + schedule = optax.constant_schedule(self.learning_rate) + case "cosine": + schedule = optax.cosine_decay_schedule(self.learning_rate, lr_decay_steps, self.min_lr_ratio) + case "linear": + schedule = optax.linear_schedule(self.learning_rate, min_lr, lr_decay_steps) + case "inv_sqrt": + schedule = _inv_sqrt_decay_schedule(self.learning_rate, min_lr, warmup_steps, 10000) + case "inv": + schedule = _inv_decay_schedule(self.learning_rate, min_lr, lr_decay_steps) + case _: + raise ValueError(f"Unknown lr_schedule: 
{self.lr_schedule}") + + schedules.append(schedule) + boundaries.append(end) if cooldown_steps != 0: final_main_lr = schedule(lr_decay_steps) cooldown = optax.linear_schedule(final_main_lr, min_lr, cooldown_steps) schedules.append(cooldown) - boundaries.append(num_train_steps - cooldown_steps) if len(schedules) > 1: schedule = optax.join_schedules(schedules, boundaries) @@ -197,6 +209,14 @@ def schedule(count): return schedule +def _inv_decay_schedule(lr: float, min_lr: float, decay_steps: int): + def schedule(count): + decay = jnp.minimum(1.0, 1.0 / ((lr / min_lr - 1) * jnp.maximum(count, 1) / decay_steps + 1)) + return jnp.maximum(lr * decay, min_lr) + + return schedule + + def _convert_ratio_or_steps(ratio_or_steps: float, num_train_steps: int): if ratio_or_steps < 1.0: return int(ratio_or_steps * num_train_steps) From 8dd32c6f4e0927a0500a0be2e5b2ccbb5df5e33e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 3 Sep 2024 21:36:28 -0700 Subject: [PATCH 42/94] Bump ray[default] from 2.34.0 to 2.35.0 (#714) Bumps [ray[default]](https://github.com/ray-project/ray) from 2.34.0 to 2.35.0. - [Release notes](https://github.com/ray-project/ray/releases) - [Commits](https://github.com/ray-project/ray/compare/ray-2.34.0...ray-2.35.0) --- updated-dependencies: - dependency-name: ray[default] dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f936e5afe..c94ec5a6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,7 @@ dependencies = [ "matplotlib>=3.7.0", "tblib>=1.7.0,<4.0.0", "dataclasses-json~=0.6.4", - "ray[default]==2.34.0", + "ray[default]==2.35.0", "pydantic<3", "rich~=13.0", "filelock~=3.13", From ea4ea25e40814f3761b16f58074e5bfe3f1b0599 Mon Sep 17 00:00:00 2001 From: David Hall Date: Wed, 4 Sep 2024 10:12:20 -0700 Subject: [PATCH 43/94] use hf config from checkpoint by default (#715) --- docs/dev/Port-Models.md | 7 +------ examples/alpaca-lora/alpaca_lora.py | 2 +- examples/alpaca/alpaca.py | 2 +- examples/gsm8k-lora/gsm8k_lora.py | 7 ++----- src/levanter/compat/hf_checkpoints.py | 4 +++- src/levanter/data/audio.py | 3 ++- src/levanter/main/doremi_lm.py | 4 +--- src/levanter/main/eval_lm.py | 2 +- src/levanter/main/lora_lm.py | 2 +- src/levanter/main/train_asr.py | 4 +--- src/levanter/main/train_lm.py | 2 +- src/levanter/models/gemma.py | 4 ++-- src/levanter/models/gpt2.py | 2 +- tests/test_gemma.py | 5 +---- tests/test_hf_checkpoints.py | 8 +++---- tests/test_hf_gpt2_serialize.py | 30 +++++++++++++++++++-------- tests/test_llama.py | 2 +- tests/test_llama3.py | 2 +- tests/test_lora.py | 6 ++---- tests/test_mistral.py | 5 +---- tests/whisper_test.py | 2 +- 21 files changed, 50 insertions(+), 55 deletions(-) diff --git a/docs/dev/Port-Models.md b/docs/dev/Port-Models.md index cc6cf3f7d..282f51508 100644 --- a/docs/dev/Port-Models.md +++ b/docs/dev/Port-Models.md @@ -242,12 +242,7 @@ with tempfile.TemporaryDirectory() as tmpdir: ck_path = f"{tmpdir}/hf_model" hf_model.save_pretrained(ck_path) - model = converter.load_pretrained( - config.model_type, - config, - ck_path, - resize_vocab_to_match_tokenizer=False - ) + model = converter.load_pretrained(config.model_type, ref=ck_path, resize_vocab_to_match_tokenizer=False) # compare the output values between 
Levanter and HF # ... diff --git a/examples/alpaca-lora/alpaca_lora.py b/examples/alpaca-lora/alpaca_lora.py index de6e1f059..9488809ba 100644 --- a/examples/alpaca-lora/alpaca_lora.py +++ b/examples/alpaca-lora/alpaca_lora.py @@ -90,7 +90,7 @@ def train(config: TrainArgs): logger.info(f"Loading pretrained model from {converter.reference_checkpoint}") # load untrainable params in compute precision to save memory model: LmHeadModel = converter.load_pretrained( # type: ignore - model_config.model_type, model_config, axis_mapping=parameter_axis_mapping, dtype=trainer.mp.compute_dtype + model_config.model_type, axis_mapping=parameter_axis_mapping, dtype=trainer.mp.compute_dtype ) # Major difference from Alpaca: we loraize the model. diff --git a/examples/alpaca/alpaca.py b/examples/alpaca/alpaca.py index 0ecf78e6e..6578bc46c 100644 --- a/examples/alpaca/alpaca.py +++ b/examples/alpaca/alpaca.py @@ -234,7 +234,7 @@ def train(config: TrainArgs): # load the underlying hf model logger.info(f"Loading pretrained model from {converter.reference_checkpoint}") model: LmHeadModel = converter.load_pretrained( # type: ignore - model_config.model_type, model_config, axis_mapping=parameter_axis_mapping, dtype=trainer.mp.param_dtype + model_config.model_type, axis_mapping=parameter_axis_mapping, dtype=trainer.mp.param_dtype ) # this must be in jit b/c it uses arrays across accelerators (b/c of FSDP) diff --git a/examples/gsm8k-lora/gsm8k_lora.py b/examples/gsm8k-lora/gsm8k_lora.py index 0823686e1..b7ac3945c 100644 --- a/examples/gsm8k-lora/gsm8k_lora.py +++ b/examples/gsm8k-lora/gsm8k_lora.py @@ -161,11 +161,8 @@ def train(config: TrainArgs): # load the underlying hf model logger.info(f"Loading pretrained model from {converter.reference_checkpoint}") - model: LmHeadModel = converter.load_pretrained( # type: ignore - config.model.model_type, - converter.default_config, - axis_mapping=parameter_axis_mapping, - dtype=trainer.mp.compute_dtype, + model: LmHeadModel = converter.load_pretrained( + config.model.model_type, axis_mapping=parameter_axis_mapping, dtype=trainer.mp.compute_dtype ) # Major difference from Alpaca: we loraize the model. 
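A minimal sketch of the call pattern these sites are migrating to (the GPT-2 converter and checkpoint reference below are illustrative placeholders; any HF repo id or local path can serve as ref): config is now an optional keyword, and when it is omitted the converter derives the Levanter config from the checkpoint's own HF config.

from levanter.models.gpt2 import Gpt2Config

converter = Gpt2Config().hf_checkpoint_converter()

# Old call shape: converter.load_pretrained(model_type, config, ref, ...)
# New call shape: pass the checkpoint as ref=...; omit config to have the
# converter build one via config_from_hf_config(hf_config_from_hf_checkpoint(ref)),
# or pass config=... explicitly to override what the checkpoint specifies.
model = converter.load_pretrained(
    converter.default_config.model_type,
    ref="stanford-crfm/expanse-gpt2-small-x777",
)
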
diff --git a/src/levanter/compat/hf_checkpoints.py b/src/levanter/compat/hf_checkpoints.py index 226c4e6cf..5727f4360 100644 --- a/src/levanter/compat/hf_checkpoints.py +++ b/src/levanter/compat/hf_checkpoints.py @@ -498,8 +498,8 @@ def _load_shards(self, id: str, index_file: str, rev: Optional[str], dtype) -> d def load_pretrained( self, lm_model_cls: Type[ModelWithHfSerializationMixin], - config: HFCompatConfig, ref: Optional[Union[str, RepoRef]] = None, + config: Optional[HFCompatConfig] = None, axis_mapping: Optional[ResourceMapping] = None, resize_vocab_to_match_tokenizer: bool = True, dtype: Optional[jnp.dtype] = None, @@ -515,6 +515,8 @@ def load_pretrained( from contextlib import ExitStack hf_config = self.hf_config_from_hf_checkpoint(ref) + if config is None: + config = self.config_from_hf_config(hf_config) lm_model_cls = config.model_type # Vocab: first we have to resize the vocab as loaded from the checkpoint diff --git a/src/levanter/data/audio.py b/src/levanter/data/audio.py index b7b6fb15f..9a1f98d93 100644 --- a/src/levanter/data/audio.py +++ b/src/levanter/data/audio.py @@ -214,9 +214,10 @@ class AudioTaskConfig(abc.ABC): rows_per_chunk: int = DEFAULT_ROWS_PER_CHUNK # number of rows to process and cache per chunk enforce_bos: bool = True # whether to append bos even if the tokenizer doesn't enforce_eos: bool = True # whether to append eos even if the tokenizer doesn't + max_length: int = 448 @cached_property - def the_processor(self) -> PreTrainedTokenizerBase: + def the_processor(self) -> ProcessorMixin: return load_processor(self.processor) @cached_property diff --git a/src/levanter/main/doremi_lm.py b/src/levanter/main/doremi_lm.py index 42d84d54d..12b3e6ae0 100644 --- a/src/levanter/main/doremi_lm.py +++ b/src/levanter/main/doremi_lm.py @@ -88,9 +88,7 @@ def main(config: TrainLmConfig): # initialize the ref model if config.ref_model_from_hf: assert converter is not None - ref_model = converter.load_pretrained( - config.model.model_type, config.model, dtype=config.trainer.mp.compute_dtype - ) + ref_model = converter.load_pretrained(config.model.model_type, dtype=config.trainer.mp.compute_dtype) else: ref_model_shape = eqx.filter_eval_shape(config.model.build, Vocab, key=jrandom.PRNGKey(0)) ref_model = levanter.checkpoint.load_checkpoint( diff --git a/src/levanter/main/eval_lm.py b/src/levanter/main/eval_lm.py index 6d92c717a..df41750ab 100644 --- a/src/levanter/main/eval_lm.py +++ b/src/levanter/main/eval_lm.py @@ -102,7 +102,7 @@ def compute_loss(model: LmHeadModel, example: LmExample): converter: HFCheckpointConverter = model_config.hf_checkpoint_converter() converter = converter.replaced(reference_checkpoint=config.hf_checkpoint, tokenizer=tokenizer) model_from_hf_checkpoint = converter.load_pretrained( - model_config.model_type, model_config, config.hf_checkpoint, dtype=mp.compute_dtype + model_config.model_type, ref=config.hf_checkpoint, dtype=mp.compute_dtype ) loss = callbacks.eval_loss_loop(compute_loss, model_from_hf_checkpoint, eval_loader, max_batches=total) diff --git a/src/levanter/main/lora_lm.py b/src/levanter/main/lora_lm.py index d3526a97f..9d7018c7e 100644 --- a/src/levanter/main/lora_lm.py +++ b/src/levanter/main/lora_lm.py @@ -92,7 +92,7 @@ def main(config: LoraLmConfig): # load the underlying hf model logger.info(f"Loading pretrained model from {converter.reference_checkpoint}") model = converter.load_pretrained( - model_config.model_type, model_config, axis_mapping=parameter_axis_mapping, dtype=trainer.mp.compute_dtype + model_config.model_type, 
axis_mapping=parameter_axis_mapping, dtype=trainer.mp.compute_dtype ) @haliax.named_jit(axis_resources=parameter_axis_mapping, donate_args=(True)) diff --git a/src/levanter/main/train_asr.py b/src/levanter/main/train_asr.py index 82d8dd601..2d0651198 100644 --- a/src/levanter/main/train_asr.py +++ b/src/levanter/main/train_asr.py @@ -144,9 +144,7 @@ def compute_loss( # this is a bit gross, but we want to free up the memory from the model we just built state = dataclasses.replace(state, model=None) assert isinstance(config.model.asr_model_type, ModelWithHfSerializationMixin) - model = converter.load_pretrained( # type: ignore - config.model.asr_model_type, config.model, axis_mapping=parameter_axis_mapping - ) + model = converter.load_pretrained(config.model.asr_model_type, axis_mapping=parameter_axis_mapping) model = named_jit(trainer.mp.cast_to_param, parameter_axis_mapping)(model) state = dataclasses.replace(state, model=model) else: diff --git a/src/levanter/main/train_lm.py b/src/levanter/main/train_lm.py index 099fe2eb6..e76f6bc5d 100644 --- a/src/levanter/main/train_lm.py +++ b/src/levanter/main/train_lm.py @@ -147,7 +147,7 @@ def main(config: TrainLmConfig): gc.collect() model = converter.load_pretrained( config.model.model_type, - config.model, + config=config.model if not config.use_hf_model_config else None, axis_mapping=parameter_axis_mapping, dtype=trainer.mp.compute_dtype, ) diff --git a/src/levanter/models/gemma.py b/src/levanter/models/gemma.py index 2b2f95e93..1f8396b20 100644 --- a/src/levanter/models/gemma.py +++ b/src/levanter/models/gemma.py @@ -65,7 +65,7 @@ class GemmaConfig(HFCompatConfig): rope_scaling (Dict, ignored): dict containing the scaling configuration for the Rotary Positional Embedding. """ - activation_function: str = "gelu" + activation_function: str = "gelu_new" initializer_range: float = 0.02 layer_norm_epsilon: float = 1e-5 @@ -130,7 +130,7 @@ def from_hf_config(cls, hf_config: HfConfig): if hf_config.hidden_activation: activation_function = hf_config.hidden_activation else: - activation_function = hf_config.hidden_act + activation_function = "gelu_pytorch_tanh" if activation_function == "gelu_pytorch_tanh": activation_function = "gelu_new" diff --git a/src/levanter/models/gpt2.py b/src/levanter/models/gpt2.py index 178a8434c..a921074e9 100644 --- a/src/levanter/models/gpt2.py +++ b/src/levanter/models/gpt2.py @@ -39,7 +39,7 @@ @LmConfig.register_subclass("gpt2") @dataclass(frozen=True) class Gpt2Config(HFCompatConfig): - seq_len: int = 512 + seq_len: int = 1024 hidden_dim: int = 768 num_layers: int = 12 num_heads: int = 12 diff --git a/tests/test_gemma.py b/tests/test_gemma.py index 8eaaac045..64a3149fe 100644 --- a/tests/test_gemma.py +++ b/tests/test_gemma.py @@ -186,10 +186,7 @@ def test_gemma_roundtrip(scan_layers, num_kv_heads): torch_model.save_pretrained(f"{tmpdir}/torch_model") model = converter.load_pretrained( - converter.default_config.model_type, - converter.default_config, - f"{tmpdir}/torch_model", - resize_vocab_to_match_tokenizer=False, + converter.default_config.model_type, ref=f"{tmpdir}/torch_model", resize_vocab_to_match_tokenizer=False ) def compute(input): diff --git a/tests/test_hf_checkpoints.py b/tests/test_hf_checkpoints.py index d0ad667a3..976b6bac4 100644 --- a/tests/test_hf_checkpoints.py +++ b/tests/test_hf_checkpoints.py @@ -59,9 +59,7 @@ def test_save_backpack_model_with_code(): new_converter = converter.replaced(reference_checkpoint=tmpdir, trust_remote_code=True) assert new_converter.config_from_hf_config(config) == 
lev_config - loaded_model = new_converter.load_pretrained( - new_converter.default_config.model_type, new_converter.default_config - ) + loaded_model = new_converter.load_pretrained(new_converter.default_config.model_type) loaded_model = inference_mode(loaded_model, True) assert loaded_model.config == lev_model.config @@ -117,7 +115,9 @@ def test_save_sharded_checkpoints(): assert len(glob.glob(tmpdir + "/*.safetensors")) > 1 - loaded_model = converter.load_pretrained(Gpt2LMHeadModel, nano_model.config, ref=tmpdir, dtype=mp.param_dtype) + loaded_model = converter.load_pretrained( + Gpt2LMHeadModel, ref=tmpdir, config=nano_model.config, dtype=mp.param_dtype + ) assert loaded_model.config == nano_model.config assert loaded_model.Vocab == nano_model.Vocab diff --git a/tests/test_hf_gpt2_serialize.py b/tests/test_hf_gpt2_serialize.py index 69ed85b9c..7a5475738 100644 --- a/tests/test_hf_gpt2_serialize.py +++ b/tests/test_hf_gpt2_serialize.py @@ -6,8 +6,10 @@ import fsspec import jax import numpy as onp +import pytest from fsspec import AbstractFileSystem from jax.random import PRNGKey +from numpy.testing import assert_allclose from transformers import AutoModelForCausalLM from transformers import GPT2Config as HfGpt2Config from transformers import GPT2LMHeadModel as HfGpt2LMHeadModel @@ -36,6 +38,8 @@ def test_hf_gpt2_roundtrip_fa(): _roundtrip_compare_gpt2_checkpoint("gpt2", None, config=config) +# TODO: gotta figure out why this regressed +@pytest.mark.skip @skip_if_no_torch def test_mistral_gpt2_roundtrip(): _roundtrip_compare_gpt2_checkpoint("stanford-crfm/expanse-gpt2-small-x777", "checkpoint-60000") @@ -44,35 +48,42 @@ def test_mistral_gpt2_roundtrip(): def _roundtrip_compare_gpt2_checkpoint(model_id, revision, config: Optional[Gpt2Config] = None): import torch - config = config or Gpt2Config() - converter = config.hf_checkpoint_converter() + if config is None: + converter = Gpt2Config(use_flash_attention=False).hf_checkpoint_converter() + else: + converter = config.hf_checkpoint_converter() torch_model: HfGpt2LMHeadModel = AutoModelForCausalLM.from_pretrained(model_id, revision=revision) torch_model.eval() - config = config or converter.default_config model: Gpt2LMHeadModel = cast( Gpt2LMHeadModel, - converter.load_pretrained(config.model_type, config, RepoRef(model_id, revision=revision)), + converter.load_pretrained(Gpt2LMHeadModel, RepoRef(model_id, revision=revision), config), ) model = inference_mode(model, True) + lm_head = model.embeddings.token_embeddings + jax_lm_head = onp.array(lm_head.weight.array) + torch_lm_head = torch_model.transformer.wte.weight.detach().cpu().numpy() + assert torch_lm_head.shape == jax_lm_head.shape + assert_allclose(jax_lm_head, torch_lm_head, rtol=1e-4, atol=1e-4) + input = hax.random.randint(PRNGKey(0), model.Pos, 0, model.Vocab.size) + attn_mask = AttentionMask.causal() # we compare softmaxes because the numerics are wonky and we usually just care about the softmax torch_out = torch_model(torch.from_numpy(onp.array(input.array)).to(torch.int32).unsqueeze(0)) torch_out = torch_out.logits[0].detach().cpu().numpy() torch_out = jax.nn.softmax(torch_out, axis=-1) - attn_mask = AttentionMask.causal() - def compute(input): return hax.nn.softmax(model(input, key=None, attn_mask=attn_mask), axis=model.Vocab) compute = jax.jit(compute) jax_out = compute(input).array assert torch_out.shape == jax_out.shape, f"{torch_out.shape} != {jax_out.shape}" - assert onp.isclose(torch_out, onp.array(jax_out), rtol=1e-2, atol=1e-2).all(), f"{torch_out} != {jax_out}" + # 
get the argmaxes for the two models + assert_allclose(torch_out, onp.array(jax_out), rtol=1e-2, atol=1e-2) with tempfile.TemporaryDirectory() as tmpdir: converter.save_pretrained(model, tmpdir) @@ -83,6 +94,7 @@ def compute(input): torch_out2 = torch_model2(torch.from_numpy(onp.array(input.array)).to(torch.int32).unsqueeze(0)) torch_out2 = torch_out2.logits[0].detach().cpu().numpy() torch_out2 = jax.nn.softmax(torch_out2, axis=-1) + assert onp.isclose(torch_out2, onp.array(jax_out), rtol=1e-2, atol=1e-2).all(), f"{torch_out2} != {jax_out}" @@ -111,7 +123,7 @@ def _compare_gpt2_checkpoint_gradients(model_id, revision, config: Optional[Gpt2 torch_model: HfGpt2LMHeadModel = AutoModelForCausalLM.from_pretrained(model_id, revision=revision) torch_model.eval() - model = cast(Gpt2LMHeadModel, converter.load_pretrained(config.model_type, config, RepoRef(model_id, revision))) + model = cast(Gpt2LMHeadModel, converter.load_pretrained(config.model_type, RepoRef(model_id, revision), config)) model = inference_mode(model, True) input = hax.random.randint(PRNGKey(0), model.Pos, 0, model.Vocab.size) @@ -193,7 +205,7 @@ def test_hf_save_to_fs_spec(): fs: AbstractFileSystem = fsspec.filesystem("memory") fs.get("model/", f"{tmpdir}/test", recursive=True) - loaded_model = converter.load_pretrained(Gpt2LMHeadModel, config, ref=f"{tmpdir}/test") + loaded_model = converter.load_pretrained(Gpt2LMHeadModel, ref=f"{tmpdir}/test") simple_dict = simple_model.to_state_dict() loaded_dict = loaded_model.to_state_dict() diff --git a/tests/test_llama.py b/tests/test_llama.py index cf96adaf2..3fc6a551e 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -297,7 +297,7 @@ def test_llama_roundtrip(scan_layers, num_kv_heads): torch_model.save_pretrained(f"{tmpdir}/torch_model") model = converter.load_pretrained( - LlamaLMHeadModel, config, f"{tmpdir}/torch_model", resize_vocab_to_match_tokenizer=False + LlamaLMHeadModel, ref=f"{tmpdir}/torch_model", resize_vocab_to_match_tokenizer=False ) @hax.named_jit diff --git a/tests/test_llama3.py b/tests/test_llama3.py index 38d1c9fe6..a6f1d67b8 100644 --- a/tests/test_llama3.py +++ b/tests/test_llama3.py @@ -86,7 +86,7 @@ def test_llama_roundtrip(): torch_model.save_pretrained(f"{tmpdir}/torch_model") model = converter.load_pretrained( - LlamaLMHeadModel, config, f"{tmpdir}/torch_model", resize_vocab_to_match_tokenizer=False + LlamaLMHeadModel, ref=f"{tmpdir}/torch_model", resize_vocab_to_match_tokenizer=False ) @hax.named_jit diff --git a/tests/test_lora.py b/tests/test_lora.py index e23f02504..f9268d350 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -112,10 +112,8 @@ def test_lora_peft_integration(): hf_dict = get_peft_model_state_dict(model) - converter = Gpt2Config().hf_checkpoint_converter - lev_model = converter.load_pretrained( - converter.default_config, converter.default_config.model_type, "stanford-crfm/expanse-gpt2-small-x777" - ) + converter = Gpt2Config().hf_checkpoint_converter() + lev_model = converter.load_pretrained(converter.default_config.model_type, "stanford-crfm/expanse-gpt2-small-x777") lora_lev_model = loraize(lev_model, LoraConfig(r=8, target_modules=["c_attn"]), key=jax.random.PRNGKey(0)) # for some dumb reason, the hf state dict starts with this prefix diff --git a/tests/test_mistral.py b/tests/test_mistral.py index f595b80c1..dbcb4555a 100644 --- a/tests/test_mistral.py +++ b/tests/test_mistral.py @@ -111,10 +111,7 @@ def test_mistral_roundtrip(num_kv_heads): torch_model.save_pretrained(f"{tmpdir}/torch_model") model = 
converter.load_pretrained( - converter.default_config.model_type, - converter.default_config, - f"{tmpdir}/torch_model", - resize_vocab_to_match_tokenizer=False, + converter.default_config.model_type, ref=f"{tmpdir}/torch_model", resize_vocab_to_match_tokenizer=False ) def compute(input): diff --git a/tests/whisper_test.py b/tests/whisper_test.py index f90a13de7..048f7f124 100644 --- a/tests/whisper_test.py +++ b/tests/whisper_test.py @@ -137,7 +137,7 @@ def test_hf_roundtrip(): torch_model: HfWhisperModel = HfWhisperModel.from_pretrained(model_id) torch_model.eval() - model: WhisperModel = cast(WhisperModel, converter.load_pretrained(config.model_type, config, RepoRef(model_id))) + model: WhisperModel = cast(WhisperModel, converter.load_pretrained(config.model_type, RepoRef(model_id), config)) model = inference_mode(model, True) ds = load_dataset("WillHeld/test_librispeech_parquet", split="validation") From fbe27bc7f3591a6403fef3b2b6c805114e9215f4 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 5 Sep 2024 11:10:11 -0700 Subject: [PATCH 44/94] Completely rework dataset/cache system: instant resume, perfect shuffle, stable mixtures and more (#716) Introduces a massive rework of Levanter's cache system to support instant resume, perfect shuffle, stable mixtures and such. The basic idea is to use TensorStore to store all of our data as a kind of janky column store (implemented in JaggedArrayStore) and pytrees of such (implemented in TreeStore). TensorStore provides efficient storage and access to very large arrays. We still support streaming from an in progress cache via a new AsyncDataset class. I've successfully tests this on the pile and, modulo the usual issues with the llama tokenizer on long documents/books, it behaves well. Closes #626 #311 #119 #34 --- .dockerignore | 1 + .github/workflows/run_entry_tests.yaml | 2 +- .github/workflows/run_ray_tests.yaml | 4 +- .github/workflows/run_tests.yaml | 2 +- config/data/redpajama_1b_source.yaml | 1 - config/data/redpajama_1t_source.yaml | 1 - config/data/rpv1_llama.yaml | 1 - config/gpt2_nano_mixture.yaml | 2 +- config/llama2_small_fast_mix.yaml | 163 ++ docs/Fine-Tuning.md | 2 +- docs/Training-On-Your-Data.md | 32 +- docs/design/Data-Loader-Design.md | 334 ++-- examples/alpaca-lora/alpaca_lora.py | 2 +- examples/alpaca/alpaca.py | 82 +- examples/gsm8k-lora/gsm8k_lora.py | 69 +- pyproject.toml | 3 +- scripts/repair_cache.py | 60 - src/levanter/callbacks.py | 5 +- src/levanter/checkpoint.py | 5 +- src/levanter/compat/hf_checkpoints.py | 2 +- src/levanter/data/__init__.py | 36 +- src/levanter/data/_preprocessor.py | 20 +- src/levanter/data/_process_interleave.py | 338 ---- src/levanter/data/_prp.py | 63 + src/levanter/data/_queue.py | 41 +- src/levanter/data/audio.py | 194 ++- src/levanter/data/dataset.py | 380 +++- src/levanter/data/loader.py | 452 +++-- src/levanter/data/metrics_monitor.py | 25 +- src/levanter/data/mixture.py | 224 ++- src/levanter/data/permutation.py | 135 ++ src/levanter/data/shard_cache.py | 1521 ----------------- ...arded_dataset.py => sharded_datasource.py} | 82 +- src/levanter/data/text.py | 552 +++--- src/levanter/doremi.py | 28 +- src/levanter/eval.py | 101 +- src/levanter/logging.py | 4 +- src/levanter/main/cache_dataset.py | 4 +- src/levanter/main/eval_lm.py | 6 +- src/levanter/main/lora_lm.py | 14 +- src/levanter/main/train_asr.py | 13 +- src/levanter/main/train_lm.py | 32 +- src/levanter/main/viz_logprobs.py | 8 +- src/levanter/store/__init__.py | 6 + src/levanter/store/cache.py | 1321 ++++++++++++++ 
src/levanter/store/jagged_array.py | 508 ++++++ src/levanter/store/stress_test_new_cache.py | 149 ++ src/levanter/store/tree_store.py | 237 +++ src/levanter/tracker/wandb.py | 10 +- src/levanter/trainer.py | 35 +- src/levanter/utils/background_iterable.py | 85 +- src/levanter/utils/fsspec_utils.py | 6 + src/levanter/utils/index.py | 46 + src/levanter/utils/jax_utils.py | 4 +- src/levanter/utils/py_utils.py | 7 - src/levanter/utils/ray_utils.py | 4 +- src/levanter/utils/thread_utils.py | 28 + tests/__init__.py | 0 tests/test_audio.py | 52 +- tests/test_background_iterable.py | 70 +- tests/test_checkpoint.py | 7 - tests/test_data_mixture.py | 126 -- tests/test_doremi.py | 85 +- tests/test_in_progress_sequence.py | 124 -- tests/test_jagged_array.py | 305 ++++ tests/test_llama.py | 3 - tests/test_lora.py | 5 +- tests/test_mixture.py | 155 ++ tests/test_new_cache.py | 921 ++++++++++ ...eplicated_loader.py => test_new_loader.py} | 143 +- tests/test_newdataset.py | 142 ++ tests/test_prp.py | 87 + tests/test_shard_cache.py | 383 ----- tests/test_sharded_dataset.py | 4 +- tests/test_sharded_loader.py | 299 ---- tests/test_shuffle_dataset.py | 30 - tests/test_text.py | 30 +- tests/test_tokenized_document_cache.py | 216 --- tests/test_tree_store.py | 435 +++++ tests/test_utils.py | 12 +- tests/tiny_test_corpus.py | 20 +- 81 files changed, 6691 insertions(+), 4455 deletions(-) create mode 100644 config/llama2_small_fast_mix.yaml delete mode 100644 scripts/repair_cache.py delete mode 100644 src/levanter/data/_process_interleave.py create mode 100644 src/levanter/data/_prp.py create mode 100644 src/levanter/data/permutation.py delete mode 100644 src/levanter/data/shard_cache.py rename src/levanter/data/{sharded_dataset.py => sharded_datasource.py} (89%) create mode 100644 src/levanter/store/__init__.py create mode 100644 src/levanter/store/cache.py create mode 100644 src/levanter/store/jagged_array.py create mode 100644 src/levanter/store/stress_test_new_cache.py create mode 100644 src/levanter/store/tree_store.py create mode 100644 src/levanter/utils/index.py create mode 100644 src/levanter/utils/thread_utils.py create mode 100644 tests/__init__.py delete mode 100644 tests/test_data_mixture.py delete mode 100644 tests/test_in_progress_sequence.py create mode 100644 tests/test_jagged_array.py create mode 100644 tests/test_mixture.py create mode 100644 tests/test_new_cache.py rename tests/{test_replicated_loader.py => test_new_loader.py} (62%) create mode 100644 tests/test_newdataset.py create mode 100644 tests/test_prp.py delete mode 100644 tests/test_shard_cache.py delete mode 100644 tests/test_sharded_loader.py delete mode 100644 tests/test_shuffle_dataset.py delete mode 100644 tests/test_tokenized_document_cache.py create mode 100644 tests/test_tree_store.py diff --git a/.dockerignore b/.dockerignore index 17fbbcfe1..45dfa95e6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,6 +2,7 @@ scratch cache +new-cache wandb checkpoints diff --git a/.github/workflows/run_entry_tests.yaml b/.github/workflows/run_entry_tests.yaml index 9ab96773e..ab08013ee 100644 --- a/.github/workflows/run_entry_tests.yaml +++ b/.github/workflows/run_entry_tests.yaml @@ -21,7 +21,7 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest - pip install . 
"jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}" + pip install .[test] "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}" pip install soundfile librosa - name: Run entry tests with pytest run: | diff --git a/.github/workflows/run_ray_tests.yaml b/.github/workflows/run_ray_tests.yaml index c82611793..42139e576 100644 --- a/.github/workflows/run_ray_tests.yaml +++ b/.github/workflows/run_ray_tests.yaml @@ -21,8 +21,8 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest - pip install . "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}" + pip install .[test] "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}" pip install soundfile librosa - name: Run ray tests with pytest run: | - XLA_FLAGS=--xla_force_host_platform_device_count=8 PYTHONPATH=$(pwd)/tests:$(pwd)/src:$(pwd):. pytest tests -m ray + PYTHONPATH=$(pwd)/tests:$(pwd)/src:$(pwd):. pytest tests -m ray diff --git a/.github/workflows/run_tests.yaml b/.github/workflows/run_tests.yaml index a6d4f7cab..6e9ed7024 100644 --- a/.github/workflows/run_tests.yaml +++ b/.github/workflows/run_tests.yaml @@ -21,7 +21,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install . "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}" + pip install .[test] "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}" pip install -r ./tests/requirements.txt - name: Test with pytest run: | diff --git a/config/data/redpajama_1b_source.yaml b/config/data/redpajama_1b_source.yaml index 1a873ed9a..aaa817399 100644 --- a/config/data/redpajama_1b_source.yaml +++ b/config/data/redpajama_1b_source.yaml @@ -3,4 +3,3 @@ cache_dir: gs://levanter-data/tokenized/redpajama-sample/ tokenizer: EleutherAI/gpt-neox-20b splits: - train -rows_per_chunk: 32768 diff --git a/config/data/redpajama_1t_source.yaml b/config/data/redpajama_1t_source.yaml index f70f7c192..4a4b29474 100644 --- a/config/data/redpajama_1t_source.yaml +++ b/config/data/redpajama_1t_source.yaml @@ -3,4 +3,3 @@ cache_dir: gs://levanter-data/tokenized/redpajama/ tokenizer: EleutherAI/gpt-neox-20b splits: - train -rows_per_chunk: 4096 diff --git a/config/data/rpv1_llama.yaml b/config/data/rpv1_llama.yaml index 92a46b50c..75a7b7ff2 100644 --- a/config/data/rpv1_llama.yaml +++ b/config/data/rpv1_llama.yaml @@ -1,5 +1,4 @@ cache_dir: gs://levanter-data/tokenized/redpajama_v1_llama_mixture -rows_per_chunk: 4096 tokenizer: "meta-llama/Llama-2-7b-hf" configs: arxiv: diff --git a/config/gpt2_nano_mixture.yaml b/config/gpt2_nano_mixture.yaml index 673187312..2939b9e5e 100644 --- a/config/gpt2_nano_mixture.yaml +++ b/config/gpt2_nano_mixture.yaml @@ -7,7 +7,7 @@ data: id: dlwh/wikitext_103_detokenized train_weights: wikitext: 1.0 - w2: 0 + w2: 1.0 model: type: gpt2 hidden_dim: 32 diff --git a/config/llama2_small_fast_mix.yaml b/config/llama2_small_fast_mix.yaml new file mode 100644 index 000000000..aabd17fae --- /dev/null +++ b/config/llama2_small_fast_mix.yaml @@ -0,0 +1,163 @@ +data: + tokenizer: "meta-llama/Llama-2-7b-hf" + cache_dir: "gs://levanter-data/new-tokenized/pile_mix/" + shuffle: + era_length: 10000 + configs: + arxiv: + train_urls: + - gs://levanter-data/pile-domains/arxiv/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/arxiv/val.jsonl.zst + books2: + train_urls: + - gs://levanter-data/pile-domains/books2/{00..29}.jsonl.zst + validation_urls: + - 
gs://levanter-data/pile-domains/books2/val.jsonl.zst + books3: + train_urls: + - gs://levanter-data/pile-domains/books3/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/books3/val.jsonl.zst + dm_math: + train_urls: + - gs://levanter-data/pile-domains/dm_math/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/dm_math/val.jsonl.zst + enron: + train_urls: + - gs://levanter-data/pile-domains/enron/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/enron/val.jsonl.zst + europarl: + train_urls: + - gs://levanter-data/pile-domains/europarl/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/europarl/val.jsonl.zst + free_law: + train_urls: + - gs://levanter-data/pile-domains/freelaw/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/freelaw/val.jsonl.zst + github: + train_urls: + - gs://levanter-data/pile-domains/github/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/github/val.jsonl.zst + hackernews: + train_urls: + - gs://levanter-data/pile-domains/hackernews/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/hackernews/val.jsonl.zst + nih: + train_urls: + - gs://levanter-data/pile-domains/nih/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/nih/val.jsonl.zst + opensubtitles: + train_urls: + - gs://levanter-data/pile-domains/opensubtitles/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/opensubtitles/val.jsonl.zst + owt2: + train_urls: + - gs://levanter-data/pile-domains/owt2/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/owt2/val.jsonl.zst + pg_19: + train_urls: + - gs://levanter-data/pile-domains/pg_19/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/pg_19/val.jsonl.zst + philpapers: + train_urls: + - gs://levanter-data/pile-domains/philpapers/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/philpapers/val.jsonl.zst + pile_cc: + train_urls: + - gs://levanter-data/pile-domains/pile_cc/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/pile_cc/val.jsonl.zst + pubmed_abs: + train_urls: + - gs://levanter-data/pile-domains/pubmed_abs/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/pubmed_abs/val.jsonl.zst + pubmed_central: + train_urls: + - gs://levanter-data/pile-domains/pubmed_central/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/pubmed_central/val.jsonl.zst + stack_exchange: + train_urls: + - gs://levanter-data/pile-domains/stack_exchange/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/stack_exchange/val.jsonl.zst + ubuntu_irc: + train_urls: + - gs://levanter-data/pile-domains/ubuntu_irc/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/ubuntu_irc/val.jsonl.zst + uspto: + train_urls: + - gs://levanter-data/pile-domains/uspto/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/uspto/val.jsonl.zst + wiki_en: + train_urls: + - gs://levanter-data/pile-domains/wiki_en/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/wiki_en/val.jsonl.zst + youtube_subtitles: + train_urls: + - gs://levanter-data/pile-domains/youtube_subtitles/{00..29}.jsonl.zst + validation_urls: + - gs://levanter-data/pile-domains/youtube_subtitles/val.jsonl.zst + train_weights: + # these weights come from the paper https://arxiv.org/pdf/2101.00027.pdf + pile_cc: 0.1811 + 
pubmed_central: 0.1440 + books3: 0.1207 + owt2: 0.1001 + arxiv: 0.0896 + github: 0.0759 + free_law: 0.0612 + stack_exchange: 0.0513 + uspto: 0.0365 + pubmed_abs: 0.0307 + pg_19: 0.0217 + opensubtitles: 0.0155 + wiki_en: 0.0153 + dm_math: 0.0124 + ubuntu_irc: 0.0088 + books2: 0.0075 + europarl: 0.0073 + hackernews: 0.0062 + youtube_subtitles: 0.0060 + philpapers: 0.0038 + nih: 0.0030 + enron: 0.0014 +model: + type: llama + hidden_dim: 768 + intermediate_dim: 2048 + num_heads: 6 + num_kv_heads: 6 + num_layers: 12 + seq_len: 1024 + gradient_checkpointing: true +trainer: + tracker: + project: "levanter" + tags: [ "pile", "llama", "itest"] + + mp: p=f32,c=bfloat16 + model_axis_size: 1 + + train_batch_size: 256 + num_train_steps: 20000 +optimizer: + learning_rate: 1E-3 + weight_decay: 0.1 + warmup: 0.01 diff --git a/docs/Fine-Tuning.md b/docs/Fine-Tuning.md index 0b7cecd58..ebe0377ea 100644 --- a/docs/Fine-Tuning.md +++ b/docs/Fine-Tuning.md @@ -406,7 +406,7 @@ def _get_data_source(path_or_id): if fsspec_utils.exists(path_or_id): return JsonDataset([path_or_id]) else: - return levanter.data.dataset_from_hf(path_or_id, split="train") + return levanter.data.datasource_from_hf(path_or_id, split="train") ``` Preprocessing in Levanter typically happens in two phases: diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md index 381ab9dc3..3fac85a07 100644 --- a/docs/Training-On-Your-Data.md +++ b/docs/Training-On-Your-Data.md @@ -106,10 +106,6 @@ data: ### Mixture of Sources -!!! warning - - This feature is experimental and may change in the future. - If you have multiple sources of data (e.g., multiple domains, or distinct subsets of data), you can use the `data` section of your training configuration to specify them: ```yaml @@ -145,14 +141,14 @@ validation data. ## Data Preprocessing Levanter supports both online and offline preprocessing. Online preprocessing is done on-the-fly -during training. With online preprocessing, you don't need to think about preprocessing your data. +during training. With online preprocessing, you don't need to think about preprocessing your data +except to make sure it's in the right format and where you'd like to store the cached preprocessing +results. Our data loading pipeline will automatically break and concatenate documents into chunks equal to the model's `seq_len` parameter. It will also automatically add special tokens to the end of documents. -We don't yet handle sequence-to-sequence tasks, but we plan to. - ### Online Preprocessing We have a sophisticated caching mechanism using [Ray](https://docs.ray.io/en/latest/) @@ -160,8 +156,7 @@ that builds a cache of preprocessed data on the fly. Online caching happens tran in the background, using the mostly-idle CPU-cores of the machine(s) you are training on. The cache that is built is fully reproducible, and can be used for future training runs. -Training will start as soon as each training machine has its first shard of data cached -and once the validation data is cached. +Training will start as soon as the system has the data it needs. ### Offline Preprocessing @@ -190,19 +185,28 @@ python -m levanter.main.cache_dataset \ ### Direct Cache Construction As a final option, you can directly construct a cache of preprocessed data without using Ray. This is useful if you -have custom preprocessing logic or Ray isn't working for you for some reason. To do so, you can use [levanter.data.SerialCacheWriter][] +have custom preprocessing logic or Ray isn't working for you for some reason. 
To do so, you can use [levanter.store.SerialCacheWriter][] to write batches directly. Here's an example: ```python -from levanter.data import SerialCacheWriter +import numpy as np + +from levanter.store import SerialCacheWriter + +exemplar = { + "input_ids": np.zeros((0), dtype=np.int32), + "attention_mask": np.zeros((0), dtype=np.int32), + "labels": np.zeros((0), dtype=np.int32), +} -with SerialCacheWriter(cache_dir, rows_per_chunk=1024) as writer: +with SerialCacheWriter(cache_dir, exemplar) as writer: for batch in process_batches(): + # batch should be a list of dicts, each with keys "input_ids", "attention_mask", and "labels" writer.write_batch(batch) ``` -`batch` can be a `list[dict]`, `dict[list]`, or `pyarrow.RecordBatch`. To work with `train_lm`, it should have an -`input_ids` key that is a list of `int`s. +In this case, `batch` should be a list of dicts, each with keys `"input_ids"`, `"attention_mask"`, and `"labels"`. +To work with `train_lm`, it should have an `input_ids` key that is a list of `int`s. To use a cache like this, you can use the `passthrough` tokenizer: diff --git a/docs/design/Data-Loader-Design.md b/docs/design/Data-Loader-Design.md index e9386a762..b000e93c0 100644 --- a/docs/design/Data-Loader-Design.md +++ b/docs/design/Data-Loader-Design.md @@ -1,254 +1,174 @@ # Data Loader Design -## Design as of 2023-04-18 -### Goals +## Context -We want to support the following: -1) Deterministic batches, even for a changing number of readers (or writers). That is, for any cluster size -during training, we want the same batches to be generated in the same order. -2) Sharded reading and writing. We want to be able to read and write from multiple shards in parallel. -3) Simultaneous reading and writing of shards. We want to be able to start training while we are still building the cache. -4) Fast resumption without losing too much progress. This applies to both *writing* and *reading* the cache. That is, when we - resume a training run, we want to finish producing the cache and also jump to the right place in the cache for reads. -5) (eventually) shuffling/random access -6) We want to be able to build the cache offline too. -7) We want to support batches that are composed of fragments of documents. In particular, we take a moving window of tokens - from documents. This implies that the mapping from "documents" to "batches" is not 1:1, or easy to compute. +Levanter, like any LM training framework, needs to read (usually text) data to feed it to the model. This +process involves reading lots of raw text, tokenizing it, and splitting it up into model-sized chunks. +Unlike many other ML workloads, the mapping from raw data to model-sized chunks is not 1:1, but in general +many-to-many. This is because we typically take a moving window of tokens from a list of documents. -We want to support the following use cases: -1) We have a larger training dataset, and we want to draw samples from it more or less independently on a large number of machines. - We don't really care about "epochs"/"passes", but we do want to be able to handle resumes and be deterministic. Ideally, each - machine only reads from the chunks that it needs to read from. -2) We have a smaller validation dataset, and we want to do a single pass over it. We don't care about resuming, and it's ok if -we have to read the whole dataset on each machine. -3) (Eventually) Like (1) but we want to jump around the dataset. We still care about resuming and determinism, but don't care about epochs. 
+Levanter is designed to be completely deterministic, meaning that if you run the same code on the same data on +the same hardware, you should get the same results. This is important for debugging and for reproducibility. +In order to guarantee determinism, our data loading pipeline must be deterministic as well. +Moreover, to the extent possible, we want deterministic batch order even if the number of machines changes. -We focus on (1) and (2) for now. +Data is usually stored in compressed shards, each morally equivalent to an iterator over a list of documents. +In particular, we don't usually have random access. This implies that we need to produce a cache of processed +documents that does allow for random access. Random access is important for resuming training quickly, +as well as for shuffling. +Early on in Levanter's development, we made the decision to support "quick start" training, where we can start +training while we are still building the cache. This is helpful when iterating on the data pipeline +and removes a step from the training process. This implies that we need to support simultaneous reading and writing +of the cache. -## Some terminology +Levanter also wants to support dynamic mixtures of data, where we reweight different datasets on the fly. To do so, +we need separate caches for each dataset. -* **Shard**: A shard is a list of *raw* documents that not been tokenized/preprocessed. -* **Chunk**: A chunk is a list of *processed* documents that have been tokenized/preprocessed. -* **Reader**: A reader is a process that reads from the cache. Typically there is one reader per machine. -* **Writer**: A writer is a process that writes to the cache. Typically there is one writer per machine. -* **Global ordering**: The global ordering is the ordering of chunks in the cache. This is the order in which - documents are read by readers. The global ordering is defined with respect to an "idealized" number of readers R*. (See below.) -* **Processor** or **Tokenizer**: A function that takes a raw document and returns a processed document. -* **Example** is a single datum that is fed into the model. Examples are typically composed of fragments of documents. - For example, we might take a moving window of tokens from the concatenation of a list of preprocessed documents. - - -We say there are K input shards, W writers, R readers. We assume K >= W (though typically K is not too large), and W ≈ R. -We produce N chunks. We also define an idealized number of readers R*, which defines the global ordering over the data. -Typically R* should be the maximum number of readers we expect to actually use. - - -## Cache structure -We define a shard cache as a list of "chunks", where each chunk is a parquet file (plus metadata) with an equal -number of documents (except for the last chunks for each shard.) -Each chunk is a list of processed documents. Chunks are ordered round robin from the input shards, so that the c'th global chunk is the -c%K'th chunk of the c/K'th shard, so long as all shards have at least c/K chunks. (After that, we remove shards that -have been exhausted and continue round robin.) -We keep the following metadata: -* For each shard, we keep a list of chunks written so far and whether or not we are done processing that shard. -* For each chunk, we keep the number of documents, token counts/length of various fields, and the number of bytes. - (This metadata can be used for seeking.) 
-* For the cache overall, we keep the global ordering of chunks, the number of chunks, and the number of documents. - -### Chunk format - -A Chunk is an Apache Parquet file with schema dependent on the task. For example, for language modeling, we might have -just a sequence of input_ids per document. We use Apache Parquet because it's compact and doesn't require us to know -much about the datatypes we're using. - -Chunks also have metadata stored in a separate json file. This metadata includes the total number of documents in the -chunk, as well as token counts/lengths of various fields. This metadata is used for seeking. - -## Cache construction - -We use Ray to handle the construction of the cache. There are 4 types of processes/actors that we create using Ray: - -* A ChunkCacheBroker actor, whose job is to dispense chunks to readers while the cache is being built. It is also - responsible for keeping track of the global ordering of chunks. -* A ChunkCacheBuilder actor, which is responsible for building the cache. It forks off processes for processing - input shards. It acts as a callback for these processes, and accepts chunk metadata from them. -* Shard writer processes, one per input shard. The function _produce_cache_for_shard is the entry point for these processes. - This function is responsible for reading from the input shard and forking off processes to process chunks of documents. -* Chunk processor processes, which are responsible for processing documents and creating chunks. _produce_chunk is the - entry point for these processes. - -Readers are managed by the model training processes, which read by sending requests to the broker via the Ray. They -are not themselves Ray actors/processes. +In practice, even for the relatively "small" examples one has in LM training (compared to vision, for example), +we also want to do sharded loading. -## Reproducible Sharded Reading for Training +## Goals -We want to be able to read from the cache in a way that is deterministic and reproducible, even if the number of readers -changes. We also want readers to only read from the chunks that they need to read from. -We pretend the list of data is infinite by cycling. We do track epochs when reading this way. +To summarize: -NB Our goal is a deterministic ordering over examples, and not merely chunks or even documents. +* **Deterministic batches**: For any cluster size during training, we want the same batches to be + generated in the same order. +* **Instant Resume**: We want training to be able to resume training quickly, without losing too much progress. +* **Quick Start**: Unless it is logically impossible (e.g. for shuffling), we want to be able to start training + while we are still building the cache. +* **Random Access**: We want to be able to jump around the dataset, for shuffling and for resuming. -Given a list of chunks and the idealized number of readers R*, we define the global ordering over chunks as follows: -First define R* iterators over chunks, with `chunk_iterators[r]` being defined as `loop(all_chunks)[r::R*]`. +## Cache Design -Next, define a function `mk_examples(chunk_iterator)` that takes a list of iterators over chunks and returns -a list of examples. Define `chunk_examples[r] = mk_examples(chunk_iterators[r])`. -This function depends on our sequence length, etc. 
Then the ordering over examples is: +### Terminology -`chunk_examples[0][0], chunk_examples[1][0], ..., chunk_examples[R*-1][0], ..., chunk_examples[0][1], chunk_examples[1][1], ..., chunk_examples[R*-1][1], ...` -that is, `example[i] == chunk_examples[i % R*][i // R*]` - -If we have $R*$ readers, then each `reader_iterator[r][j] == chunk_examples[r][j] == example[j * R* + r]`. -Moreover, if either R or R* is a multiple of the other, then we still get a nice property where -each reader reads from a strided slice of the chunk_iterators: - -(Boring math) -* If we have R readers, then `reader_iterator[r][j] == example[j * R + r] == chunk_examples[(j * R + r) % R*][(j * R + r) // R*]` -* If we have `R == n * R*`, then `reader_iterator[r][j] == example[j * R + r] == chunk_examples[(j * R + r) % R*][(j * R + r) // R*] - == chunk_examples[r % R*][(j * n * R* + r) // R*] == chunk_examples[r % R*][j * n + r // R*],` so each reader reads from -a strided slice (specifically `islice(..., r//R*, None, n)`) -* If we have `R* == n * R`, then `reader_iterator[r][j] == example[j * R + r] == chunk_examples[(j * R + r) % R*][(j * R + r) // R*] -== chunk_examples[R * (j % n) + r][(j * R + r) // R*]` and so each reader reads from n different chunk_exampless. -so we round-robin over a slice of the chunk_examples. - -For other cases (R and R* don't divide each other), there's no simple relationship between the reader and chunk iterators -and you end up reading from everywhere, but that's ok. - -# Single-Pass Reading for Evaluation -When we want to do a single pass over the data, we don't cycle and we don't shuffle. We just read the data in order. Boring -and simple. - - -## Resuming - -We need to think about resuming in two cases: resuming writes and resuming reads. - -## Resuming Writes - -Resuming writes is relatively easy, since we can just keep track of the number of chunks written for each shard and the -number of documents written for each chunk. Then you just skip to the appropriate document and start writing. - -### Resuming Reads - -We want to understand how to seek to the b'th batch. +* **Document**: A document is a single datum that is fed into the model. Documents are typically tokenized and + preprocessed. For LM training, a document is just a string, but for other tasks, it might be more complex. For example, + there might be a "prompt" and a "response". +* **Shard**: A shard is a list of *raw* documents that not been tokenized/preprocessed. +* **Cache**: A cache is a list of *processed* documents that have been tokenized/preprocessed. These are stored as +a group of [TensorStore](https://google.github.io/tensorstore/) arrays structured to behave like a column store. These +arrays are stored as Zarray arrays, typically compressed. +* **Reader**: A reader is a process that reads from the cache. Typically, there is one reader per machine. +* **Writer**: A writer is a process that writes to the cache. Typically, there is one writer per *cache*. +* **Global ordering**: Each document in a cache is assigned a global index. This index is deterministic, but +a bit hard to compute a priori. +* **Processor** or **Tokenizer**: A function that takes a batch of raw documents and returns a batch of processed documents. +* **Example**: A single datum that is fed into the model. Examples are typically composed of fragments of documents. + For example, we might take a moving window of tokens from the concatenation of a list of preprocessed documents. +* **Ledger**: A ledger is a list of metadata about the cache. 
This includes the number of documents in each shard +as well as some information to make it less likely that you accidentally reuse a cache. -There are two cases of resuming we need to think about: -1) The "easy" case where 1 example == 1 (preprocessed) document. -2) The "hard" case where the mapping from examples to documents is not 1:1, but there is some easily computable relationship. +### Cache Structure -In the first case, each reader `r` reads `documents[r::R]`. The `b`th batch -is `documents[b * batch_size:(b+1) * batch_size]`. Assuming `batch_size % R == 0`, then for the b'th batch, reader r -needs to read `documents[b * batch_size + r: (b+1) * batch_size + r: R] == docs(chunk_iterator[r])[b * batch_size // R:(b+1) * batch_size // R]`. -If we know how many documents are in each chunk, then we can seek to the right place in the chunk. +A cache is a [PyTree](https://jax.readthedocs.io/en/latest/pytrees.html) of [levanter.store.JaggedArray][]s, each +representing a different field of the processed documents. Each JaggedArray is a group of either two or three arrays: -The second case is broadly similar. In particular, we consider the case where we take moving windows of concatenated documents. -If our metadata includes token counts, then we can skip chunks until we pass `batch_size * tokens_per_example // R` tokens. +* **Data**: The actual data, stored as a Zarray array. All the "tokens" for a given field for all documents are stored in a single flat array. +* **Offsets**: The offsets into the data array for each document. This is a 1-D array of length N+1, where N is the number of documents. +* **Shape** (optional): The shape of the data for each document. This is only present for fields that are not 1-D. +For tokenized documents, a cache looks like this: -## Shuffling +``` +cache +├── train +│ ├── input_ids +│ │ ├── data +│ │ │ ├── c +│ │ │ │ └── 0 +│ │ │ └── zarr.json +│ │ └── offsets +│ │ ├── c +│ │ │ └── 0 +│ │ └── zarr.json +│ ├── shard_ledger.json +``` -### A brief digression -Why do we want to shuffle during training? Shuffling reduces variance in the gradients. If we have batches -where every example is from the same document/domain, then the gradients for those batches will be correlated. +(Typically there's a lot more files in the `c` directories, but I've omitted them for brevity.) -That said, in our setting where we use moving windows from documents, if we round-robin from chunks (which are produced -from different documents), and R* is roughly equal to the batch size, then we will read from a different chunk for every -example in a batch, which reduces correlation within a batch. +The stuff under `input_ids/data` is the actual data, and the stuff under `input_ids/offsets` is the offsets. -However, we still have (undesirable) correlation between batches: if we -read from chunks consecutively and our documents are long, then many examples in the next batch will be from the -same document as an example in the previous batch. Ideally this wouldn't happen. I'm not convinced that it matters -that much. +In code, this is modeled in [levanter.store.TreeStore][]. -Proper shuffling is incompatible with streaming at a fundamental level. Our choices are something like: +### Cache Construction -* Randomly shuffle before preprocessing. Makes life a bit less pleasant for people with a new dataset. Can't be changed after preprocessing. Doesn't solve the problem of correlated batches. -* Reservoir sampling. Makes resumes hard, but is easy to implement. 
-* "Epochal" reservoir sampling, where we periodically "flush" the reservoir. Resumes are easier because you can start from the latest "epoch" -* No shuffling in the first pass, but shuffle in subsequent passes. -* Shuffle within a range of chunks that grows as the run progresses. +We use Ray to handle the construction of the cache. There are 4 types of processes/actors that we create using Ray: -My hunch is that we can skip this for now, and revisit if we find that it's a problem. +- `_TreeStoreCacheBuilder`: This actor is responsible for building the cache. It forks off actors for reading + shards and processing documents. It acts as a callback for these processes. +- `_OrderedCacheWriter`: This actor is responsible for writing to the cache. It is responsible for writing the + processed documents to the cache in the correct order. +- `WorkQueueDispatcherActor`: This actor is responsible for reading batches of documents from a group of shards. It dispatches + documents to a group of processors, which are responsible for processing the documents. +- `_BatchProcessorQueue`: This actor is responsible for managing the queue of batches of documents to be processed. It + actually calls the processors to process the documents and then forwards the results to the writer. +The basic flow is that the builder forks off a bunch of `WorkQueueDispatcherActor`s, which read from the shards and +dispatch the documents to the processors. The processors process the documents and send the results to the writer, +which writes them to the cache. -## Current Status as of 2022-10-10 +The writer is responsible for writing the documents to the cache in the correct order. In particular, fix a batch +size B. The writer writes the documents in batches of size B, round-robin from the shards. Once a shard is exhausted, +it is removed from the list of shards. -The current data loader (in levanter/data/text.py and levanter/data/sharded.py) works as follows: +The writer maintains a "ledger" of the cache, which has the number of documents processed in each shard, as well as +whether or not the shard is done. This ledger is used for resuming cache construction. -### TokenizedDocumentCache -* We build a TokenizedDocumentCache, which creates a (user-specified) number of shards (default 128 for training). Documents are tokenized (via an HF tokenizer) and written to the cache in batches of 1000 (by default), with each batch being written to the *smallest* shard. -* The underlying format is a Parquet file, for text data this means a sequence of input_ids stored in a batched columnar layout and compressed -* When you iterate through the TokenizedDocumentCache, it reads the shards in a round-robin fashion, and yields batches of documents, as they were written. -* It can optionally "flatten" a batch of documents into a single doc (which are delimited by eos), which is what we do with TokenSeqDataset. +## Datasets and the Data Loader +Along with the cache, we introduce interfaces and classes for working with the cache. The main classes are: -### TokenSeqDataset -* At load time, a TokenizedDocumentCache is typically wrapped in an TokenSeqDataset, which just wraps the -TokenizedDocumentCache and sets a max_seq_len. This is the fundamental data structure that is used by the data loader. -* The TokenSeqDataset iterates through the TokenizedDocumentCache one batch at a time. The docs are (implicitly) -concatenated together. If a concatenated doc is longer than max_seq_len, then it is split into chunks of max_seq_len. 
Any left over at the end of a batch is currently thrown out, matching Mistral's behavior.
+- [levanter.data.AsyncDataset][]: This is the base class for all datasets. The main method it exposes is
+  `get_batch(indices: Sequence[int])` which (asynchronously) returns a batch of documents for the given indices.
+- [levanter.data.DataLoader][]: This is a class that wraps a dataset and provides an iterator over the dataset. It prefetches
+  the parts of batches that each machine needs. It has an iterator and supports "seeking" to a particular batch.
+- [levanter.store.TreeCache][]: This is an AsyncDataset that wraps a cache and exposes a `get_batch` method that returns
+  a batch of documents for the given indices.
+- [levanter.data.TokenSeqDataset][]: This is an async dataset that does the chunking of documents into examples. It
+  takes a cache and a `max_seq_len` and returns examples of length `max_seq_len`.
+- [levanter.data.PermutationDataset][]: This is a dataset that permutes the indices of another dataset. It is used for shuffling.
+- [levanter.data.EraShufflingDataset][]: This is a dataset that emulates the behavior of a shuffle buffer, while
+  still supporting random access. It is used for shuffling while still building the cache.
+- [levanter.data.MixtureDataset][]: This is a dataset that mixes together multiple datasets with different weights.

-### ShardedTokenSeqDataset
+### [levanter.data.PermutationDataset][]

-* Recall that model computation is done by creating a 2-D grid of TPUs, with the first axis being "data" and the other being "model". All devices on the same row process the same slice of a batch. Typically a row does not span multiple nodes, but usually a node will have multiple rows.
-* We can conceptually group the rows into "row groups" such that either a row group is just 1 row, or it spans all rows that are on the same node.
-* The job of the ShardedTokenSeqDataset is to shard the TokenSeqDataset into a number of shards and loads the data so that each row gets its own data. Each row group of the 2-d mesh is assigned a shard (i.e. a set of cache files) that it loads from exclusively.
-* For each batch, a node reads however many examples it needs to fill its row group. We then create a GlobalDeviceArray which orchestrates the shards together.
+The PermutationDataset is a dataset that permutes the indices of another dataset. It is backed by a pseudo-random
+permutation (PRP). PRPs give you random access to a permutation with O(1) time and memory.

-### Misc notes / problems
-* There's no randomness anywhere.
-* If documents are very long, this means we're reading from the same doc repeatedly for a batch, which is not ideal.
-* We get bitwise determinism so long as the grid configuration doesn't change.
-* Because we write to the smallest shard, with a large enough dataset, we should have roughly the same number of tokens in each shard, but it won't be exact.
-* Because of the above (and because I didn't know how to do it) we don't have a way for one process to signal that it's done. So we just loop the dataset forever. This isn't ideal for evaluation, if nothing else.
-* We haven't implemented seeking in the DataLoader, so resumes are expensive. This is not super hard in principle, but it's not implemented.
-* Mentioning again that we drop the last batch of a shard if it's not full. This is not ideal. We should pad it and return masks.
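
For intuition, here is a minimal sketch of the PRP idea behind the PermutationDataset: a keyed Feistel network plus cycle-walking gives O(1) random access to a fixed permutation of `[0, n)` without ever materializing it. This is only an illustration with made-up helper names, not the `levanter.data._prp` implementation introduced by this patch.

```python
import hashlib


def _mix(x: int, key: int, r: int, half_bits: int) -> int:
    # Keyed round function; any deterministic hash of (key, round, x) will do for a sketch.
    digest = hashlib.blake2b(f"{key}:{r}:{x}".encode(), digest_size=8).digest()
    return int.from_bytes(digest, "big") % (1 << half_bits)


def permute_index(i: int, n: int, key: int, rounds: int = 4) -> int:
    """Map i in [0, n) to a unique j in [0, n) in O(1) time and memory (illustrative, not the levanter API)."""
    assert 0 <= i < n
    half = max(1, ((n - 1).bit_length() + 1) // 2)
    mask = (1 << half) - 1
    x = i
    while True:
        # One pass of a Feistel network over 2 * half bits, then "cycle-walk" until we land back in [0, n).
        left, right = x >> half, x & mask
        for r in range(rounds):
            left, right = right, left ^ _mix(right, key, r, half)
        x = (left << half) | right
        if x < n:
            return x


# sorted(permute_index(i, 10, key=42) for i in range(10)) == list(range(10)), i.e. the map is a bijection,
# so a shuffled dataset can answer get_batch(indices) by permuting the requested indices on the fly.
```
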
+
-## Resumable, Streaming Dataset with good randomness
+### [levanter.data.EraShufflingDataset][]

-Goal: a streaming dataset that is:
-1. disk-seek efficient (meaning we don't jump to a random position in a random shard for every sample)
-2. reasonably random, including robust to long documents.
-3. resumable (meaning it's relatively cheap to resume if a run crashes)
-4. doesn't eat too much disk
-5. fully replicable with same configuration (meaning that if you run the same code on the same data, you get the same results)
-6. (stretch) fully replicable with different configurations (meaning that if you run the same code on the same data, you get the same results, even if you change the number of nodes)
+The EraShufflingDataset is a dataset that emulates the behavior of a shuffle buffer, while still supporting random access.
+It works by defining an "era" length, which is the number of samples that are shuffled together. After an era is exhausted,
+the dataset shuffles the next era.

-It's easy to get (1) with streaming, and (2) by jumping to random offsets for every sample. Shuffle buffers get you (1)
-and (2) together, but only if documents aren't too long. (3) comes easily if you do streaming OR random jumping
-constantly, but is a bit painful with a shuffle buffer. You can get (1), (2) and (3) if you are willing to lay out the
-entire shuffled dataset on disk for every epoch. But that's not ideal.
+### [levanter.data.MixtureDataset][]

+We implement "stable mixtures" where the number of samples from each domain for each batch is fixed. This acts
+as a kind of variance reduction, while also enabling random access and sampling without replacement.

-For (1)-(4), we take a middle path: we stream into a shuffle buffer, but we jump to random offsets after every K samples. Moreover,
-we serialize the shuffle buffer, the positions of the datasets, and the random seed to disk when we checkpoint, so that we can resume easily.
+Note: I believe it's impossible to sample without replacement and have random access with sampled batches.
+This is because for each item `i`, you sample a domain `d_i`, but you can't know which indices in the domain have
+been sampled. Sampling with replacement is easy so long as you know how big each domain is ahead of time, which means
+you can't do streaming.

-(5) is easy if you serialize the relevant state, or can just restart your iterators deterministically.
-(6) is hard to do in a sharded way. It's easy to "scale down" by emulating a larger number of nodes with a smaller
-number of nodes, but it's hard to "scale up". To do this, we can think of each row as having its own stream of data,
-perhaps sliced out of a larger stream? TODO for version 3
+## Performance

-### Tasks
+### Reading from the Cache

-#### TokenSeqDataset
-* [] supports seek that jumps to a random offset in a random shard in the TokenizedDocumentCache
-* [] can report current position for serialization
-* [] can resume from a position
+TensorStore can sustain high throughput but has pretty terrible latency (when hitting GCS).
+The latency can be on the order of a second. We mitigate this by prefetching the data in the DataLoader.

-#### JumpingDataset
-* [] has a policy for jumping around in an TokenSeqDataset
-* [] has a random key and a position within the policy
-* [] can report key and current position for serialization
+With prefetching we can sustain about a million tokens per second per host, which is sufficient.
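
To make the era-shuffling and stable-mixture ideas above concrete, here is a rough sketch of (a) shuffling indices only within fixed-size eras and (b) turning mixture weights into a fixed per-batch count for each domain. These helpers are illustrative only: the names are invented, a real era shuffle could reuse the PRP trick instead of materializing a per-era permutation, and the final (shorter) era of a finite dataset is ignored here.

```python
import numpy as np


def era_shuffled_index(i: int, era_length: int, seed: int) -> int:
    """Shuffle within eras only: index i is remapped to another index inside the same era."""
    era = i // era_length
    rng = np.random.default_rng(seed + era)  # a per-era seed keeps the mapping deterministic
    perm = rng.permutation(era_length)       # only this era's permutation is needed for random access
    return era * era_length + int(perm[i % era_length])


def stable_domain_counts(weights: dict[str, float], batch_size: int) -> dict[str, int]:
    """Turn mixture weights into a fixed number of examples per domain in every batch."""
    total = sum(weights.values())
    fractional = {d: batch_size * w / total for d, w in weights.items()}
    counts = {d: int(f) for d, f in fractional.items()}
    # Hand any leftover slots to the domains with the largest fractional parts.
    leftover = batch_size - sum(counts.values())
    for d in sorted(fractional, key=lambda d: fractional[d] - counts[d], reverse=True)[:leftover]:
        counts[d] += 1
    return counts


# e.g. stable_domain_counts({"pile_cc": 0.18, "arxiv": 0.09, "github": 0.08}, 256) always returns the
# same per-domain counts, so batch b of the mixture is a pure function of b and the per-domain datasets.
```
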
+In particular, when training a GPT-2 small model on a v3-32, loading is able to keep up with training. +However, 3/4 of evaluation time is spent blocked on loading data, so we could potentially speed up evaluation. +(However it's still twice as fast as with the old cache and data loader.) -#### ShuffleBufferDataset -* [] has a shuffle buffer of size ≤N -* [] has a random key -* [] can report key and shuffle buffer for serialization +### Writing to the Cache -#### Misc -* [] dataset hierarchy that exposes the interfaces we need (tree_leaves probably for serialization?) -* [] serialize the dataset on all nodes. This logic might need to be a bit different than for models, since the models all use GDAs and only write plain old arrays once. -* [] make sure we can resume from a checkpoint with bitwise determinism as before +Writes are also slow, but we also batch up the writes, typically writing 8K documents at a time. diff --git a/examples/alpaca-lora/alpaca_lora.py b/examples/alpaca-lora/alpaca_lora.py index 9488809ba..000b5a715 100644 --- a/examples/alpaca-lora/alpaca_lora.py +++ b/examples/alpaca-lora/alpaca_lora.py @@ -124,7 +124,7 @@ def loraize_hf_model(model): # Levanter has two kinds of data loaders: sharded and replicated. Replicated is simpler and allows for # single pass training. Sharded only loads a subset of the data on each device, and is more efficient for large # datasets. We use replicated here since the dataset is small. - loader = trainer.replicated_loader(train_dataset, trainer.TrainBatch) + loader = trainer.data_loader(train_dataset, trainer.TrainBatch) loader = non_caching_cycle(loader) if int(state.step) != 0: diff --git a/examples/alpaca/alpaca.py b/examples/alpaca/alpaca.py index 6578bc46c..a2201de76 100644 --- a/examples/alpaca/alpaca.py +++ b/examples/alpaca/alpaca.py @@ -15,8 +15,7 @@ import levanter from levanter.compat.hf_checkpoints import HFCheckpointConverter, save_hf_checkpoint_callback -from levanter.data import Dataset -from levanter.data.sharded_dataset import JsonDataset, JsonlDataset, WrappedHFDataset +from levanter.data import PermutationDataset from levanter.models.lm_model import LmExample, LmHeadModel, compute_next_token_loss from levanter.optim import OptimizerConfig from levanter.trainer import Trainer, TrainerConfig @@ -100,53 +99,21 @@ class TrainArgs: hf_save_steps: int = 1000 # How often to save the HuggingFace checkpoint. -# Encoder/Decoder dataset for Alpaca. -# We basically do string interpolation of the (instruction, input, output) triples with the prompt, -# and mask out the prompt and padding. 
-class SupervisedDataset(Dataset[LmExample]): - def __init__(self, preproc_dataset, tokenizer, mask_inputs): - self.preproc_dataset = preproc_dataset - self.tokenizer = tokenizer - self.mask_inputs = mask_inputs - - def __iter__(self): - for ex in self.preproc_dataset: - # annoyingly, pad expects things to be batched so we have to prepend a batch axis - ex = self.tokenizer.pad( - {k: np.expand_dims(v, 0) for k, v in ex.items()}, return_tensors="np", padding="max_length" - ) - ex = {k: v[0] for k, v in ex.items()} - input_ids = hax.named(ex["input_ids"], "position") - - # mask out padding and anything before the start of the target - Pos = input_ids.resolve_axis("position") - if self.mask_inputs: - loss_mask = hax.arange(Pos) >= ex["source_lens"] - - # don't predict the padding - targets = hax.roll(input_ids, -1, Pos) - loss_mask = loss_mask & (targets != self.tokenizer.pad_token_id) - else: - loss_mask = 1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.float32) - - yield LmExample(input_ids, loss_mask) - - def _get_data_source(path_or_id): """The original alpaca.py used a json file, but it's since been moved to the HF dataset hub. You can use any dataset that's compatible with the structure of the alpaca dataset.""" if fsspec_utils.exists(path_or_id): # we're a bit generous here b/c we support compression if ".jsonl" in path_or_id: - return JsonlDataset([path_or_id]) + return levanter.data.datasource_from_jsonl([path_or_id]) elif ".json" in path_or_id: - return JsonDataset([path_or_id]) + return levanter.data.datasource_from_json([path_or_id]) else: raise ValueError( f"We only support HF Datasets or a data file with .json or .jsonl extensions, not {path_or_id}!" ) else: - return WrappedHFDataset(path_or_id, split="train") + return levanter.data.datasource_from_hf(path_or_id, split="train") def mk_dataset(config: TrainArgs, tokenizer: transformers.PreTrainedTokenizerBase): @@ -175,12 +142,37 @@ def format_example(ex): "source_lens": source_lens, } - dataset = dataset.map_batches(preprocess, batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer)) - dataset = dataset.build_or_load_cache(config.data_cache_dir, await_finished=True) - - dataset = SupervisedDataset(dataset, tokenizer, mask_inputs=config.mask_inputs) + dataset = dataset.map_batches(preprocess, batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer)) # type: ignore + dataset = dataset.build_or_load_cache(config.data_cache_dir, await_finished=True) # type: ignore + + def _prepare_example(ex: dict) -> LmExample: + """ + Prepare an example for training. This function converts the (cached) batch encoding into an LmExample. + + It goes through the following steps: + + 1. Pad the batch to the maximum length. + 2. Mask out the input and prompt if requested. + 3. Create an LmExample with the input_ids as the input and the next token as the target. 
+ """ + # annoyingly, pad expects things to be batched so we have to prepend a batch axis + ex = tokenizer.pad({k: np.expand_dims(v, 0) for k, v in ex.items()}, return_tensors="np", padding="max_length") + ex = {k: v[0] for k, v in ex.items()} + input_ids = hax.named(ex["input_ids"], "position") + # mask out padding and anything before the start of the target + Pos = input_ids.resolve_axis("position") + if config.mask_inputs: + loss_mask = hax.arange(Pos) >= ex["source_lens"] + + # don't predict the padding + targets = hax.roll(input_ids, -1, Pos) + loss_mask = loss_mask & (targets != tokenizer.pad_token_id) + else: + loss_mask = 1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.float32) + lm_ex = LmExample.causal(input_ids, loss_mask=loss_mask) + return lm_ex - return dataset + return dataset.map(_prepare_example) def get_prompts(prompt_path) -> dict: @@ -208,7 +200,7 @@ def train(config: TrainArgs): ) # Randomness in JAX is tightly controlled. We pass around a key that is used to generate random numbers. - training_key = jrandom.PRNGKey(config.trainer.seed) + training_key, data_key = jrandom.split(jrandom.PRNGKey(config.trainer.seed), 2) # This is largely the same as in Alpaca. Only change is we use the fast tokenizer. tokenizer = transformers.AutoTokenizer.from_pretrained( @@ -224,6 +216,7 @@ def train(config: TrainArgs): converter = converter.replaced(tokenizer=tokenizer) train_dataset = mk_dataset(config, tokenizer) + train_dataset = PermutationDataset(train_dataset, data_key) optimizer = config.optimizer.build(config.trainer.num_train_steps) @@ -243,11 +236,12 @@ def train(config: TrainArgs): # Levanter has two kinds of data loaders: sharded and replicated. Replicated is simpler and allows for # single pass training. Sharded only loads a subset of the data on each device, and is more efficient for large # datasets. We use replicated here since the dataset is small. - loader = trainer.replicated_loader(train_dataset, trainer.TrainBatch) + loader = trainer.data_loader(train_dataset, trainer.TrainBatch) loader = non_caching_cycle(loader) state = trainer.initial_state(training_key, model=model) + # TODO: remove this. 
we don't need it now if int(state.step) != 0: logger.info(f"Resuming training from step {state.step}") for i in range(state.step): diff --git a/examples/gsm8k-lora/gsm8k_lora.py b/examples/gsm8k-lora/gsm8k_lora.py index b7ac3945c..97a9c06ab 100644 --- a/examples/gsm8k-lora/gsm8k_lora.py +++ b/examples/gsm8k-lora/gsm8k_lora.py @@ -14,9 +14,8 @@ import haliax as hax import levanter -from levanter.data import Dataset -from levanter.data.dataset import ShuffleDataset -from levanter.data.sharded_dataset import WrappedHFDataset +from levanter.data import PermutationDataset +from levanter.data.sharded_datasource import WrappedHFDataSource from levanter.lora import ( LoraConfig, lora_trainable_params_filter, @@ -67,37 +66,8 @@ class TrainArgs: merged_hf_upload: Optional[str] = None -class SupervisedDataset(Dataset[LmExample]): - def __init__(self, preproc_dataset, tokenizer, mask_inputs): - self.preproc_dataset = preproc_dataset - self.tokenizer = tokenizer - self.mask_inputs = mask_inputs - - def __iter__(self): - for ex in self.preproc_dataset: - # annoyingly, pad expects things to be batched so we have to prepend a batch axis - ex = self.tokenizer.pad( - {k: np.expand_dims(v, 0) for k, v in ex.items()}, return_tensors="np", padding="max_length" - ) - ex = {k: v[0] for k, v in ex.items()} - input_ids = hax.named(ex["input_ids"], "position") - - # mask out padding and anything before the start of the target - Pos = input_ids.resolve_axis("position") - if self.mask_inputs: - loss_mask = hax.arange(Pos) >= ex["source_lens"] - - # don't predict the padding - targets = hax.roll(input_ids, -1, Pos) - loss_mask = loss_mask & (targets != self.tokenizer.pad_token_id) - else: - loss_mask = 1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.float32) - - yield LmExample.causal(input_ids, loss_mask=loss_mask) - - def mk_dataset(config: TrainArgs, tokenizer: transformers.PreTrainedTokenizerBase): - dataset = WrappedHFDataset("gsm8k", split="train", name="main") + dataset = WrappedHFDataSource("gsm8k", split="train", name="main") def preprocess(batch): def format_example(ex): @@ -125,9 +95,34 @@ def format_output(ex): dataset = dataset.map_batches(preprocess, batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer)) # type: ignore dataset = dataset.build_or_load_cache(config.data_cache_dir, await_finished=True) # type: ignore - dataset = SupervisedDataset(dataset, tokenizer, mask_inputs=config.mask_inputs) # type: ignore + def _prepare_example(ex: dict) -> LmExample: + """ + Prepare an example for training. This function converts the (cached) batch encoding into an LmExample. + + It goes through the following steps: + + 1. Pad the batch to the maximum length. + 2. Mask out the input and prompt if requested. + 3. Create an LmExample with the input_ids as the input and the next token as the target. 
+ """ + # annoyingly, pad expects things to be batched so we have to prepend a batch axis + ex = tokenizer.pad({k: np.expand_dims(v, 0) for k, v in ex.items()}, return_tensors="np", padding="max_length") + ex = {k: v[0] for k, v in ex.items()} + input_ids = hax.named(ex["input_ids"], "position") + # mask out padding and anything before the start of the target + Pos = input_ids.resolve_axis("position") + if config.mask_inputs: + loss_mask = hax.arange(Pos) >= ex["source_lens"] + + # don't predict the padding + targets = hax.roll(input_ids, -1, Pos) + loss_mask = loss_mask & (targets != tokenizer.pad_token_id) + else: + loss_mask = 1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.float32) + lm_ex = LmExample.causal(input_ids, loss_mask=loss_mask) + return lm_ex - return dataset + return dataset.map(_prepare_example) def train(config: TrainArgs): @@ -151,7 +146,7 @@ def train(config: TrainArgs): data_key = jrandom.PRNGKey(config.data_seed) train_dataset = mk_dataset(config, tokenizer) - train_dataset = ShuffleDataset(train_dataset, data_key, buffer_size=1000 * 1000) + train_dataset = PermutationDataset(train_dataset, data_key) optimizer = config.optimizer.build(config.trainer.num_train_steps) @@ -196,7 +191,7 @@ def loraize_hf_model(model): # Levanter has two kinds of data loaders: sharded and replicated. Replicated is simpler and allows for # single pass training. Sharded only loads a subset of the data on each device, and is more efficient for large # datasets. We use replicated here since the dataset is small. - loader = trainer.replicated_loader(train_dataset, trainer.TrainBatch) + loader = trainer.data_loader(train_dataset, trainer.TrainBatch) loader = non_caching_cycle(loader) if int(state.step) != 0: diff --git a/pyproject.toml b/pyproject.toml index c94ec5a6a..8712d16a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ ] dependencies = [ "haliax>=1.4.dev307", - "equinox>=0.11.4", + "equinox==0.11.4", "jaxtyping>=0.2.20", "tokenizers>=0.15.2", "transformers>=4.41.2", @@ -49,6 +49,7 @@ dependencies = [ "rich~=13.0", "filelock~=3.13", # "ai2-olmo", + "async-lru~=2.0", ] [project.urls] diff --git a/scripts/repair_cache.py b/scripts/repair_cache.py deleted file mode 100644 index 49f9e36c6..000000000 --- a/scripts/repair_cache.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import os.path -from dataclasses import dataclass -from typing import List - -import fsspec -import pyarrow -from tqdm import tqdm - -import levanter -from levanter.data.shard_cache import LEDGER_FILE_NAME, CacheLedger, ChunkMetadata, _serialize_json_and_commit - - -@dataclass -class RepairCacheArgs: - cache_path: str - - -@levanter.config.main() -def main(args: RepairCacheArgs): - """Repairs a broken cache by recreating the ledger""" - for split in ["train", "validation"]: - # find train files in the dir, which can be in cloud - fs = fsspec.get_fs_token_paths(args.cache_path)[0] - paths = os.path.join(args.cache_path, split, "*.parquet") - files = fs.glob(paths) - - # We're basically redoing this, but without the old ledger: - chunks: List[ChunkMetadata] = [] - - pbar = tqdm(files) - total_input_ids = 0 - for file in pbar: - file = f"gs://{file}" - table = pyarrow.parquet.read_metadata(file) - - input_ids = 0 - for g in range(table.num_row_groups): - input_ids += table.row_group(g).column(0).statistics.num_values - - file = file.replace(os.path.join(args.cache_path, split), "").lstrip("/") - - chunks.append( - ChunkMetadata( - name=file.replace(".parquet", ""), - num_rows=table.num_rows, - 
field_counts={"input_ids": input_ids}, - ) - ) - - total_input_ids += input_ids - - pbar.set_postfix(num_rows=table.num_rows, total_input_ids=total_input_ids) - - ledger = CacheLedger(chunks=chunks) - _serialize_json_and_commit(os.path.join(args.cache_path, split, LEDGER_FILE_NAME), ledger) - - -if __name__ == "__main__": - main() diff --git a/src/levanter/callbacks.py b/src/levanter/callbacks.py index 406a7b39a..21aaf5faa 100644 --- a/src/levanter/callbacks.py +++ b/src/levanter/callbacks.py @@ -8,13 +8,14 @@ import threading import time import warnings -from typing import Callable, Iterable, Optional +from typing import Callable, Optional import humanfriendly import jax from tqdm import tqdm import levanter.tracker +from levanter.data import DataLoader from levanter.logging import save_xla_dumps_to_wandb from levanter.tracker.helpers import log_optimizer_hyperparams from levanter.tracker.wandb import WandbConfig @@ -69,7 +70,7 @@ def eval_loss_loop(loss_fn, model, dataset, max_batches: Optional[int] = None, n def compute_validation_loss( loss_fn: Callable, # [[M, ...], jax.numpy.ndarray], - dataset: Iterable, + dataset: DataLoader, max_batches: Optional[int] = None, name: Optional[str] = None, ): diff --git a/src/levanter/checkpoint.py b/src/levanter/checkpoint.py index 7802a7f07..b102198d7 100644 --- a/src/levanter/checkpoint.py +++ b/src/levanter/checkpoint.py @@ -5,7 +5,6 @@ import os import pathlib import queue -import sys import threading import time import urllib.parse @@ -110,7 +109,7 @@ def __init__( self._manager = GlobalAsyncCheckpointManager(timeout_secs=60 * 30) if jax.process_index() == 0: - self._async_checkpoint_remover_queue: queue.Queue[str] = queue.Queue() + self._async_checkpoint_remover_queue: queue.Queue[str] = queue.Queue(maxsize=-1) self._async_checkpoint_remover_thread = threading.Thread( target=self._async_checkpoint_remover, daemon=True ) @@ -224,7 +223,7 @@ def wait_until_finished(self): def _rm_checkpoint(self, checkpoint): if jax.process_index() == 0: - print(f"Removing checkpoint {checkpoint}", file=sys.stderr, flush=True) + logger.info(f"Removing checkpoint {checkpoint}") self._async_checkpoint_remover_queue.put(checkpoint) def _do_rm_checkpoint(self, checkpoint): diff --git a/src/levanter/compat/hf_checkpoints.py b/src/levanter/compat/hf_checkpoints.py index 5727f4360..ce267041c 100644 --- a/src/levanter/compat/hf_checkpoints.py +++ b/src/levanter/compat/hf_checkpoints.py @@ -975,7 +975,7 @@ def select_if_missing(missing_leaf, new_value): else: return None - return jax.tree_map(select_if_missing, dtype_structs, new_model, is_leaf=lambda x: x is None) + return jax.tree.map(select_if_missing, dtype_structs, new_model, is_leaf=lambda x: x is None) new_buffers = _init_buffers() diff --git a/src/levanter/data/__init__.py b/src/levanter/data/__init__.py index 534ec6dbf..85d99f8ab 100644 --- a/src/levanter/data/__init__.py +++ b/src/levanter/data/__init__.py @@ -1,22 +1,24 @@ -from levanter.data.dataset import Dataset, ShardableDataset, ShuffleDataset -from levanter.data.loader import BatchLoader, ReplicatedBatchLoader, ShardedBatchLoader -from levanter.data.shard_cache import SerialCacheWriter, ShardCache, build_or_load_cache -from levanter.data.sharded_dataset import ShardedDataset, dataset_from_hf, dataset_from_jsonl -from levanter.data.utils import batched +from ._preprocessor import BatchProcessor +from .dataset import AsyncDataset, ListAsyncDataset, MappedAsyncDataset, SyncDataset +from .loader import DataLoader +from .mixture import MixtureDataset, 
StopStrategy +from .permutation import EraShufflingDataset, PermutationDataset +from .sharded_datasource import ShardedDataSource, datasource_from_hf, datasource_from_json, datasource_from_jsonl +from .utils import batched __all__ = [ "batched", - "Dataset", - "ShardableDataset", - "ShuffleDataset", - "BatchLoader", - "ReplicatedBatchLoader", - "ShardedBatchLoader", - "build_or_load_cache", - "ShardCache", - "ShardedDataset", - "SerialCacheWriter", - "dataset_from_hf", - "dataset_from_jsonl", + "ShardedDataSource", + "datasource_from_hf", + "datasource_from_jsonl", + "datasource_from_json", + "BatchProcessor", + "AsyncDataset", + "MappedAsyncDataset", + "SyncDataset", + "ListAsyncDataset", + "DataLoader", + "MixtureDataset", + "StopStrategy", ] diff --git a/src/levanter/data/_preprocessor.py b/src/levanter/data/_preprocessor.py index 08e287c54..9ee1e2dc2 100644 --- a/src/levanter/data/_preprocessor.py +++ b/src/levanter/data/_preprocessor.py @@ -17,7 +17,7 @@ """ -class BatchProcessor(Generic[T_contra], ABC): +class BatchProcessor(Generic[T_contra, U], ABC): """ A BatchProcessor is the main interface for preprocessing data. It takes a batch of data and returns a batch of processed data. It can be used to tokenize data, convert it to a RecordBatch, or do any other kind of preprocessing. @@ -25,7 +25,7 @@ class BatchProcessor(Generic[T_contra], ABC): """ @abstractmethod - def __call__(self, batch: Sequence[T_contra]) -> BatchResult: + def __call__(self, batch: Sequence[T_contra]) -> Sequence[U] | U: # U can be batched "structure of arrays" form """ Process a batch of data. You should return either a RecordBatch, a sequence of dicts (one per output example), or a dict of sequences (one per output field). @@ -34,6 +34,14 @@ def __call__(self, batch: Sequence[T_contra]) -> BatchResult: """ raise NotImplementedError + @property + @abstractmethod + def output_exemplar(self): + """ + An exemplar of what this processor returns. This is used to determine the output schema of a dataset. + """ + raise NotImplementedError + @property def resources(self) -> Dict[str, float]: """Any resources that this processor needs to run. 
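As an aside on the new two-parameter `BatchProcessor[T_contra, U]` interface introduced above: a subclass now declares both its input and output element types and must expose an `output_exemplar` so downstream caching can infer the output schema. Below is a minimal sketch assuming only the members visible in this hunk (the real base class may also expect additional properties such as `num_cpus` or `batch_size`, which other parts of this patch reference); `WhitespaceTokenProcessor` and its vocabulary are hypothetical names used purely for illustration:

```python
from typing import Sequence

import numpy as np

from levanter.data import BatchProcessor


class WhitespaceTokenProcessor(BatchProcessor[str, dict]):
    """Hypothetical processor: maps raw strings to {"input_ids": ...} records.

    Illustrative only; the real base class may require further properties not shown in this hunk.
    """

    def __init__(self, vocab: dict[str, int]):
        self.vocab = vocab

    def __call__(self, batch: Sequence[str]) -> Sequence[dict]:
        # one output dict per input example (the "sequence of dicts" convention)
        return [
            {"input_ids": np.array([self.vocab.get(tok, 0) for tok in text.split()], dtype=np.int32)}
            for text in batch
        ]

    @property
    def output_exemplar(self) -> dict:
        # an empty record with the right field names and dtypes, used to infer the cache schema
        return {"input_ids": np.zeros((0,), dtype=np.int32)}
```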
Ray uses this to schedule tasks.""" @@ -113,7 +121,7 @@ def _construct_composite_batch_processor(dataset): """ def rec(dataset): - from levanter.data.sharded_dataset import _TransformedDataset + from levanter.data.sharded_datasource import _TransformedDataset if isinstance(dataset, _TransformedDataset): source, transforms, batch_transform = rec(dataset.source) @@ -165,6 +173,10 @@ def num_gpus(self): def resources(self): return self._resources + @property + def output_exemplar(self): + return self.transforms[-1].output_exemplar + def __call__(self, batch): # batch is initially a list of elements, but after a BatchMapTransform # it can be a recordbatch, dict of lists, or list of dicts @@ -196,7 +208,7 @@ def __call__(self, batch): return batch -def dict_from_record_batch(b): +def dict_from_record_batch(b) -> dict: # we follow the convention from hf batchencoding where homogeneous-lengthed arrays are turned into nd arrays # while heterogeneous lists are left as lists of arrays diff --git a/src/levanter/data/_process_interleave.py b/src/levanter/data/_process_interleave.py deleted file mode 100644 index b4586d130..000000000 --- a/src/levanter/data/_process_interleave.py +++ /dev/null @@ -1,338 +0,0 @@ -import asyncio -import heapq -from typing import Generic, Optional, Sequence, TypeVar - -import ray - - -G = TypeVar("G") -T = TypeVar("T") - - -# this is what we want: -# shards.permute().group(G).flatmap_interleaved(f, num_workers) # produces an iterator over T - - -# TODO: can we work with this? - -# def flatmap_interleaved(f, iterable, *, num_workers, ray_remote_args=None): -# """Apply f to each element of iterable, returning an interleaved list of results. -# -# Args: -# f: A function to apply to each element of iterable. Should return an iterator -# iterable: An iterable of elements to apply f to. -# num_workers: The number of workers to use. -# -# Returns: -# iterator over the results of applying f to each element of iterable, interleaving the results -# """ -# iterable = list(enumerate(iterable)) -# # group the elements by worker -# grouped = [iterable[i::num_workers] for i in range(num_workers)] -# -# sink = RoundRobinSink.remote(range(len(iterable))) -# -# results = [_compute_round_robin.options(**(ray_remote_args or {})).remote(f, group, sink) for group in grouped] -# ray.get(results) -# -# return sink._buffer.drain() -# -# -# @ray.remote -# def _compute_round_robin(f, groups, sink): -# serials = [0] * len(groups) -# emitters = [(group_id, f(group)) for group_id, group in groups] -# done_emitters = set() -# -# while len(done_emitters) < len(groups): -# for idx in range(len(groups)): -# group_id, emitter = emitters[idx] -# if group_id in done_emitters: -# continue -# item = next(emitter, None) -# if item is None: -# done_emitters.add(group_id) -# emitters[idx] = (group_id, None) -# del emitter -# sink.group_total_known(group_id, serials[group_id]) -# else: -# sink.append_to_group(group_id, serials[group_id], item) -# serials[group_id] += 1 - - -@ray.remote -class RoundRobinSink: - def __init__(self, groups): - self._buffer = GroupRoundRobinBuffer(groups) - - def append_to_group(self, group, item_serial, item): - self._buffer.append_to_group(group, item_serial, item) - - def group_total_known(self, group, total): - self._buffer.group_total_known(group, total) - - -class GroupRoundRobinBuffer(Generic[G, T]): - """ - A buffer that holds items from multiple groups and returns them in a round-robin fashion. - The groups need not have the same number of items. 
If a group is exhausted, it is removed from the rotation. - """ - - def __init__(self, groups: Sequence[G]): - self.groups = list(groups) - self._current_group = 0 - self.buffers: dict[G, list[tuple[int, T]]] = {group: [] for group in groups} - self._remaining_groups = set(groups) - self._totals_written: dict[G, int] = {group: 0 for group in groups} - self._totals_expected: dict[G, Optional[int]] = {group: None for group in groups} - - def append_to_group(self, group: G, item_serial: int, item: T): - if group not in self.groups: - raise ValueError(f"Group {group} not in {self.groups}") - - if group not in self._remaining_groups: - raise ValueError(f"Group {group} already finished") - - heapq.heappush(self.buffers[group], (item_serial, item)) - - def group_total_known(self, group: G, total: int): - if group not in self.groups: - raise ValueError(f"Group {group} not in {self.groups}") - - if group not in self._remaining_groups: - raise ValueError(f"Group {group} already finished: {total} vs {self._totals_expected[group]}") - - self._totals_expected[group] = total - - if self._totals_written[group] == total: - assert len(self.buffers[group]) == 0 - self._remaining_groups.remove(group) - - def is_finished(self): - return len(self._remaining_groups) == 0 - - def pop(self) -> Optional[T]: - group = self._next_group_to_read_from() - if group is None: - return None - - if len(self.buffers[group]) == 0: - return None - - cur_serial, item = self.buffers[group][0] - - if cur_serial != self._totals_written[group]: - return None - - heapq.heappop(self.buffers[group]) - - self._totals_written[group] += 1 - - if self._totals_written[group] == self._totals_expected[group]: - assert len(self.buffers[group]) == 0 - assert group in self._remaining_groups - self._remaining_groups.remove(group) - - self._current_group = (self._current_group + 1) % len(self.groups) - - return item - - def drain(self) -> list[T]: - items = [] - while True: - item = self.pop() - if item is None: - break - items.append(item) - - return items - - def _next_group_to_read_from(self): - if len(self._remaining_groups) == 0: - return None - - while True: - group = self.groups[self._current_group] - if group not in self._remaining_groups: - assert self._totals_written[group] == self._totals_expected[group] - assert len(self.buffers[group]) == 0 - self._current_group = (self._current_group + 1) % len(self.groups) - else: - break - return group - - -_SENTINEL = object() - - -class _BoxedError: - def __init__(self, exc): - self.exc = exc - - def __repr__(self): - return f"BoxedError({self.exc})" - - def __str__(self): - return f"BoxedError({self.exc})" - - def __eq__(self, other): - return isinstance(other, _BoxedError) and self.exc == other.exc - - def __hash__(self): - return hash(self.exc) - - -def _is_internal_item(item): - return item is _SENTINEL or isinstance(item, _BoxedError) - - -class InProgressSequence(Generic[T]): - def __init__(self): - self._buffer: list = [] - self._total_added = 0 - self._promises: dict[int, asyncio.Future] = {} - self._finished_length: Optional[int] = None - self._finished_promise = asyncio.Future() - - def append(self, item: T): - if self._finished_length is not None and len(self._buffer) >= self._finished_length: - raise IndexError("Index out of range") - self._buffer.append(item) - self._total_added += 1 - self._fulfill_promise(len(self._buffer) - 1) - - def to_list(self): - if not self.is_finished(): - raise ValueError("Not finished") - return list(self._buffer) - - def set_item(self, idx: int, item: 
T): - # self._buffer.append(item) - # return self._fulfill_promises() - - if idx < 0: - raise IndexError("Negative indices not supported") - - if self._finished_length is not None and idx >= self._finished_length: - raise IndexError("Index out of range") - - if idx >= len(self._buffer): - self._buffer.extend([_SENTINEL] * (idx - len(self._buffer) + 1)) - - if self._buffer[idx] is _SENTINEL: - self._total_added += 1 - - self._buffer[idx] = item - self._fulfill_promise(idx) - - def item_exception(self, idx: int, exc: Exception): - if idx < 0: - raise IndexError("Negative indices not supported") - - if self._finished_length is not None and idx >= self._finished_length: - raise IndexError("Index out of range") - - promise = self._promises.pop(idx, None) - if promise is not None: - promise.set_exception(exc) - - if idx >= len(self._buffer): - self._buffer.extend([_SENTINEL] * (idx - len(self._buffer) + 1)) - - self._buffer[idx] = _BoxedError(exc) - - self.set_exception(exc) - - def set_finished_length(self, length): - if self._finished_length is not None: - raise ValueError("Finished length already set") - self._finished_length = length - return self._flush_promises() - - def set_exception(self, exc: Exception): - if not self._finished_promise.done(): - self._finished_promise.set_exception(exc) - for promise in self._promises.values(): - promise.set_exception(exc) - - self._promises.clear() - - def is_finished(self): - return self._finished_length is not None and len(self._buffer) == self._finished_length - - @property - def finished_promise(self): - return self._finished_promise - - def final_length(self): - return self._finished_length - - def current_length(self): - return len(self._buffer) - - def get_promise(self, idx): - if idx < 0: - raise IndexError("Negative indices not supported") - - if self._finished_length is not None and idx >= self._finished_length: - raise IndexError("Index out of range") - - if self._finished_promise.done() and self._finished_promise.exception(): - return self._finished_promise - - if idx < len(self._buffer): - promise = asyncio.Future() - result = self._buffer[idx] - if isinstance(result, _BoxedError): - promise.set_exception(result.exc) - return promise - elif result is not _SENTINEL: - promise.set_result(result) - return promise - - if idx in self._promises: - return self._promises[idx] - - promise = asyncio.Future() - self._promises[idx] = promise - return promise - - def finalize(self): - if self._finished_length is None: - self._finished_length = len(self._buffer) - self._flush_promises() - - assert ( - self._total_added == self._finished_length - ), f"Finalize called with {self._total_added} != {self._finished_length}" - - async def get(self, idx): - if idx < len(self._buffer): - result = self._buffer[idx] - if isinstance(result, _BoxedError): - raise result.exc - elif result is not _SENTINEL: - return result - - return await self.get_promise(idx) - - def _fulfill_promise(self, idx): - promise = self._promises.pop(idx, None) - if promise is not None: - promise.set_result(self._buffer[idx]) - - if self._total_added == self._finished_length: - self._finished_promise.set_result(None) - - def _flush_promises(self): - assert self._finished_length is not None - - if self._total_added == self._finished_length: - self._finished_promise.set_result(None) - - for idx, promise in self._promises.items(): - if idx < self._finished_length: - if self._buffer[idx] is not _SENTINEL: - promise.set_result(self._buffer[idx]) - else: - promise.set_exception(IndexError("Index 
out of range")) diff --git a/src/levanter/data/_prp.py b/src/levanter/data/_prp.py new file mode 100644 index 000000000..65a86e66f --- /dev/null +++ b/src/levanter/data/_prp.py @@ -0,0 +1,63 @@ +import typing + +import jax.lax +import jax.numpy as jnp +import jax.random as jrandom +import numpy as np + + +# TODO: do we make this a pytree +class Permutation: + # Pseudo-Random Permutation Code + """A stateless pseudo-random permutation. + + This class generates a pseudo-random permutation of a given length. The permutation is generated using a PRNG + with a fixed key. The permutation is generated by finding a random `a` and `b` such that `gcd(a, length) != 1` and + then computing the permutation as `p(x) = (a * x + b) % length`. + + This is not a very good PRP, but it is probably good enough for our purposes. + """ + # TODO: is it actually good enough for our purposes? + + def __init__(self, length, prng_key): + self.length = length + self.prng_key = prng_key + a_key, b_key = jrandom.split(prng_key) + self._a = jrandom.randint(a_key, (), 1, length) + self._b = jrandom.randint(b_key, (), 0, length) + + cond = lambda a_and_key: jnp.all(jnp.gcd(a_and_key[0], length) != 1) + + def loop_body(a_and_key): + a, key = a_and_key + this_key, key = jrandom.split(key) + a = jrandom.randint(this_key, (), 1, length) + return a, key + + self._a, key = jax.lax.while_loop(cond, loop_body, (self._a, a_key)) + + self._a = int(self._a) + self._b = int(self._b) + + @typing.overload + def __call__(self, indices: int) -> int: + ... + + @typing.overload + def __call__(self, indices: jnp.ndarray) -> jnp.ndarray: + ... + + def __call__(self, indices): + if isinstance(indices, jnp.ndarray): + # TODO: use error_if? + # import equinox as eqx + if jnp.any(indices < 0) or jnp.any(indices >= self.length): + raise IndexError(f"index {indices} is out of bounds for length {self.length}") + elif isinstance(indices, np.ndarray): + if np.any(indices < 0) or np.any(indices >= self.length): + raise IndexError(f"index {indices} is out of bounds for length {self.length}") + else: + if indices < 0 or indices >= self.length: + raise IndexError(f"index {indices} is out of bounds for length {self.length}") + + return (self._a * indices + self._b) % self.length diff --git a/src/levanter/data/_queue.py b/src/levanter/data/_queue.py index b29327a83..fd8f84860 100644 --- a/src/levanter/data/_queue.py +++ b/src/levanter/data/_queue.py @@ -8,18 +8,18 @@ from queue import PriorityQueue from typing import List, Optional, Protocol, Sequence, TypeVar -import pyarrow as pa import ray from ray.actor import ActorHandle from levanter.utils.ray_utils import RefBox -from ._preprocessor import BatchProcessor, as_record_batch +from ._preprocessor import BatchProcessor logger = pylogging.getLogger(__name__) T = TypeVar("T") +U = TypeVar("U") LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -65,28 +65,20 @@ def __le__(self, other: "PriorityWorkItem"): return self.priority <= other.priority -def _mk_queue_aware_process_task(processor: BatchProcessor[T], queue: ActorHandle): +def _mk_queue_aware_process_task(processor: BatchProcessor[T, U], queue: ActorHandle): @ray.remote(num_cpus=processor.num_cpus, num_gpus=processor.num_gpus, resources=processor.resources) - def process_task(desc, batch: List[T]) -> pa.RecordBatch: - pylogging.basicConfig(level=pylogging.INFO, format=LOG_FORMAT) + def process_task(desc, batch: List[T]): + # pylogging.basicConfig(level=DEFAULT_LOG_LEVEL, format=LOG_FORMAT) logger.debug(f"Processing batch {desc}") 
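A note on the `Permutation` class added in `_prp.py` above: the `while_loop` keeps resampling `a` until `gcd(a, length) == 1` (the docstring's `gcd(a, length) != 1` reads like a typo), because an affine map modulo `length` is a bijection exactly when `a` and `length` are coprime. A tiny standalone illustration of that property, in plain Python rather than the levanter class:

```python
import math


def affine_perm(a: int, b: int, n: int) -> list[int]:
    # p(x) = (a * x + b) % n visits every residue exactly once iff gcd(a, n) == 1
    assert math.gcd(a, n) == 1, "a must be coprime with n"
    return [(a * x + b) % n for x in range(n)]


perm = affine_perm(a=7, b=3, n=10)
assert sorted(perm) == list(range(10))  # a true permutation: each index appears exactly once
print(perm)  # [3, 0, 7, 4, 1, 8, 5, 2, 9, 6]
```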
queue.task_running.remote() - # timer_thread = WaitTimeReportingThread( - # lambda t: logger.info(f"Waiting for {desc} to be processed for {t} seconds"), interval=30 - # ) - # timer_thread.start() try: result = processor(batch) - del batch - result = as_record_batch(result) logger.debug(f"Finished processing batch {desc}") return result except Exception as e: logger.exception(f"Error while processing batch {desc}") raise e finally: - # timer_thread.shutdown() - # timer_thread.join() pass return process_task @@ -120,7 +112,7 @@ class _BatchProcessorQueue: # (Generic[T]): ray doesn't like generics def batch_size(self): return self.processor.batch_size - def __init__(self, batch_processor: BatchProcessor[T]): + def __init__(self, batch_processor: BatchProcessor[T, U]): self.pqueue = PriorityQueue() self.processor = batch_processor self._next_task_id = 0 @@ -145,7 +137,10 @@ def _maybe_start_task(self): self.ready = False item = self.pqueue.get() batch = item.batch - item.task_future.set_result(self._task_processor.remote(item.desc, batch)) + try: + item.task_future.set_result(self._task_processor.remote(item.desc, batch)) + except Exception as e: + item.task_future.set_exception(e) def task_running(self): self.ready = True @@ -153,7 +148,7 @@ def task_running(self): @ray.remote(num_cpus=0.5, scheduling_strategy="SPREAD") -class PriorityProcessorActor: +class WorkQueueDispatcherActor: def __init__(self, max_in_flight: Optional[int] = 200): pylogging.basicConfig(level=pylogging.INFO, format=LOG_FORMAT) self._queue: list[PriorityWorkItem] = [] # heapq @@ -162,9 +157,17 @@ def __init__(self, max_in_flight: Optional[int] = 200): self._current_item: Optional[PriorityWorkItem] = None self._max_in_flight = max_in_flight + self._max_priority: Optional[float] = None self._processing_thread = threading.Thread(target=self._loop, daemon=True) self._processing_thread.start() + def set_max_dispatch_priority(self, max_priority: Optional[float]): + """ + When the sink is full, we will not dispatch items with a priority higher than this. + """ + with self._queue_lock: + self._max_priority = max_priority + def assign_work(self, group: PriorityWorkTaskGroupSpec): items = group.build().items() with self._queue_lock: @@ -196,7 +199,7 @@ def shutdown(self): if self._processing_thread.is_alive(): self._processing_thread.join() - def _loop(self: "PriorityProcessorActor"): + def _loop(self: "WorkQueueDispatcherActor"): should_sleep = False backpressure_queue: list[ray.ObjectRef] = [] @@ -220,6 +223,10 @@ def drain_backpressure_to(count): should_sleep = False item = heapq.heappop(self._queue) + if self._max_priority is not None and item.priority > self._max_priority: + logger.debug(f"Item {item.name} has priority {item.priority} which is too high. 
Rescheduling.") + heapq.heappush(self._queue, item) + continue self._current_item = item try: diff --git a/src/levanter/data/audio.py b/src/levanter/data/audio.py index 9a1f98d93..d04479a24 100644 --- a/src/levanter/data/audio.py +++ b/src/levanter/data/audio.py @@ -19,17 +19,18 @@ from haliax import Axis from levanter.compat.hf_checkpoints import load_processor, load_tokenizer -from levanter.data._preprocessor import BatchProcessor, dict_from_record_batch -from levanter.data.dataset import ShardableDataset +from levanter.data import AsyncDataset +from levanter.data._preprocessor import BatchProcessor +from levanter.data.dataset import MappedAsyncDataset from levanter.data.metrics_monitor import LoggerMetricsMonitor, LoggingMetricsMonitor, MetricsMonitor -from levanter.data.shard_cache import DEFAULT_ROWS_PER_CHUNK, ShardCache, build_or_load_cache -from levanter.data.sharded_dataset import AudioTextUrlDataset, ShardedDataset, WrappedHFDataset +from levanter.data.sharded_datasource import AudioTextUrlDataSource, ShardedDataSource, WrappedHFDataSource from levanter.data.text import BatchTokenizer # intercept the logging nonsense here from levanter.logging import silence_transformer_nag from levanter.models.asr_model import AudioTextExample -from levanter.utils.jax_utils import use_cpu_device +from levanter.store.cache import TreeCache, build_or_load_cache +from levanter.utils.jax_utils import local_cpu_mesh silence_transformer_nag() # noqa @@ -44,15 +45,6 @@ logger = logging.getLogger("levanter.data.audio") -AudioTextStorageBatch = TypedDict( - "AudioTextStorageBatch", - { - "input_features": np.ndarray, - "input_ids": np.ndarray, - "attention_mask": np.ndarray, - "audio_shape": Sequence[Tuple[int, int]], - }, -) AudioTextDict = TypedDict( "AudioTextDict", { @@ -62,8 +54,14 @@ }, ) +AudioTextDict_exemplar = { + "input_features": np.zeros((1, 1), dtype=np.float32), + "input_ids": np.zeros((0,), dtype=np.int32), + "attention_mask": np.zeros((0,), dtype=np.int32), +} -class BatchAudioProcessor(BatchProcessor[Tuple[np.ndarray, int, str]]): + +class BatchAudioProcessor(BatchProcessor[Tuple[np.ndarray, int, str], AudioTextDict]): """ A batch processor that converts raw audio into the expected inputs of a model. """ @@ -81,7 +79,7 @@ def __init__( padding=True, ): self.feature_extractor: SequenceFeatureExtractor = processor.feature_extractor - self.bt: PreTrainedTokenizerBase = BatchTokenizer( + self.bt = BatchTokenizer( tokenizer, enforce_bos=enforce_bos, enforce_eos=enforce_eos, @@ -95,7 +93,7 @@ def __init__( self.override_resources = override_resources self._batch_size = batch_size - def __call__(self, batch: Sequence[Tuple[np.ndarray, int, str]]) -> AudioTextStorageBatch: + def __call__(self, batch: Sequence[Tuple[np.ndarray, int, str]]) -> Sequence[AudioTextDict]: """ Process a batch of data. 
""" @@ -106,15 +104,28 @@ def __call__(self, batch: Sequence[Tuple[np.ndarray, int, str]]) -> AudioTextSto uniq_sampling_rates: set[int] = set(sampling_rates) assert len(uniq_sampling_rates) == 1, "Sampling rates should be standardized" audio_features: BatchFeature = self.feature_extractor(audio_batch, sampling_rate=uniq_sampling_rates.pop()) - text_features: BatchEncoding = self.bt(text_batch) - combined_features = audio_features | text_features - combined_features["input_ids"] = np.array(combined_features["input_ids"]) - combined_features["attention_mask"] = np.array(combined_features["attention_mask"]) - a_features = np.array(combined_features["input_features"]) - a_shape = a_features.shape - combined_features["audio_shape"] = [a_shape[1:]] * a_shape[0] - combined_features["input_features"] = a_features.reshape(a_shape[0], -1) - return combined_features + audio_features["input_features"] = np.array(audio_features["input_features"]) + text_features: list[dict] = self.bt(text_batch) + text_features = [ + {k: np.array(tf[k], dtype=np.int32) for k in ["input_ids", "attention_mask"]} for tf in text_features + ] + + # debatch and return + out = [] + for i, text in enumerate(text_features): + out.append( + { + "input_features": audio_features["input_features"][i], + "input_ids": text["input_ids"], + "attention_mask": text["attention_mask"], + } + ) + + return out # type: ignore + + @property + def output_exemplar(self): + return AudioTextDict_exemplar @property def num_cpus(self) -> int: @@ -146,10 +157,10 @@ class AudioDatasetSourceConfig: train_urls: List[str] = () # type: ignore validation_urls: List[str] = () # type:ignore - def get_shard_source(self, split) -> Optional[ShardedDataset[Tuple[np.ndarray, int, str]]]: + def get_shard_source(self, split) -> Optional[ShardedDataSource[Tuple[np.ndarray, int, str]]]: if self.id is not None: try: - ds = WrappedHFDataset(self.id, split=split, name=self.name, streaming=self.stream) + ds = WrappedHFDataSource(self.id, split=split, name=self.name, streaming=self.stream) except ValueError as e: # if the message starts with Bad split, then just return None if str(e).startswith("Bad split"): @@ -164,7 +175,7 @@ def get_shard_source(self, split) -> Optional[ShardedDataset[Tuple[np.ndarray, i def decode(x): text = x[self.text_key] audio_pointer = x[self.audio_key] - audio = AudioTextUrlDataset.resolve_audio_pointer(audio_pointer, self.sampling_rate) + audio = AudioTextUrlDataSource.resolve_audio_pointer(audio_pointer, self.sampling_rate) return (audio["array"], audio["sampling_rate"], text) return ds.map(decode) @@ -172,7 +183,7 @@ def decode(x): split_urls = self.urls_for_split(split) if len(split_urls) == 0: return None - return AudioTextUrlDataset(split_urls, self.text_key, self.audio_key, sampling_rate=self.sampling_rate) + return AudioTextUrlDataSource(split_urls, self.text_key, self.audio_key, sampling_rate=self.sampling_rate) def doc_iterator(self, split: str) -> Iterator[Tuple[np.ndarray, int, str]]: if self.id is not None: @@ -182,7 +193,7 @@ def doc_iterator(self, split: str) -> Iterator[Tuple[np.ndarray, int, str]]: else: urls = self.urls_for_split(split) - yield from AudioTextUrlDataset(urls, self.text_key, self.audio_key, sampling_rate=self.sampling_rate) + yield from AudioTextUrlDataSource(urls, self.text_key, self.audio_key, sampling_rate=self.sampling_rate) def urls_for_split(self, split): if split == "train": @@ -211,7 +222,6 @@ class AudioTaskConfig(abc.ABC): train_split: str = "train" validation_split: str = "validation" cache_dir: str = 
"cache/" - rows_per_chunk: int = DEFAULT_ROWS_PER_CHUNK # number of rows to process and cache per chunk enforce_bos: bool = True # whether to append bos even if the tokenizer doesn't enforce_eos: bool = True # whether to append eos even if the tokenizer doesn't max_length: int = 448 @@ -232,64 +242,72 @@ def the_tokenizer(self) -> PreTrainedTokenizerBase: return load_tokenizer(self.tokenizer) @cached_property - def the_feature_extractor(self) -> PreTrainedTokenizerBase: + def the_feature_extractor(self) -> SequenceFeatureExtractor: return self.the_processor.feature_extractor @abc.abstractmethod def train_set( self, batch_size: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> ShardableDataset[np.ndarray]: + ) -> AsyncDataset[np.ndarray]: pass @abc.abstractmethod def validation_sets( - self, batch_size: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> Mapping[str, ShardableDataset[np.ndarray]]: + self, monitors: Union[bool, List[MetricsMonitor]] = True + ) -> Mapping[str, AsyncDataset[np.ndarray]]: pass -class ProcessedAudioCache(ShardableDataset[AudioTextStorageBatch]): +class ProcessedAudioCache(AsyncDataset[AudioTextDict]): """ Represents a cache of data with both pre-processed audio and tokenized text, which is a directory of parquet files with a ledger file. """ - def __init__(self, chunk_cache: ShardCache): - # Separates Batching For Processing from Batching For Training - self.chunk_cache = chunk_cache.with_batch_size(1) + def __init__(self, cache: TreeCache[AudioTextDict]): + self.cache = cache + + async def async_len(self) -> int: + return await self.cache.async_len() + + async def final_length_is_known(self) -> bool: + return await self.cache.final_length_is_known() + + def is_finite(self) -> bool: + return self.cache.is_finite() - def __iter__(self): - for batch in self._chunks(): - unarrow = dict_from_record_batch(batch) - # Flatten Singleton Batch Dimension - singleton_dict = {key: unarrow[key].squeeze() for key in unarrow} - singleton_dict["input_features"] = singleton_dict["input_features"].reshape(singleton_dict["audio_shape"]) - del singleton_dict["audio_shape"] - yield singleton_dict + async def current_len(self) -> Optional[int]: + return await self.cache.current_len() - def _chunks(self): - return self.chunk_cache.iter_batches_from_chunks() + async def get_batch(self, indices: Sequence[int]) -> Sequence[AudioTextDict]: + return await self.cache.get_batch(indices) + + # def _convert_to_example(self, storage: AudioTextStorageBatch) -> AudioTextDict: + # storage["input_features"] = storage["input_features"].reshape(storage["audio_shape"]) + # del storage["audio_shape"] + # return storage @staticmethod def build_or_load( cache_dir: str, - source: ShardedDataset[Tuple[np.ndarray, int, str]], + source: ShardedDataSource[Tuple[np.ndarray, int, str]], processor: ProcessorMixin, tokenizer: PreTrainedTokenizerBase, enforce_bos=True, enforce_eos=True, batch_size=128, - rows_per_chunk=DEFAULT_ROWS_PER_CHUNK, monitors=None, await_finished=True, override_resources=None, + max_length=448, ) -> "ProcessedAudioCache": - bp: BatchProcessor[Tuple[np.ndarray, int, str]] = BatchAudioProcessor( + bp = BatchAudioProcessor( processor, tokenizer, enforce_bos=enforce_bos, enforce_eos=enforce_eos, batch_size=batch_size, override_resources=override_resources, + max_length=max_length, ) monitors = monitors or [] cache = build_or_load_cache( @@ -297,8 +315,6 @@ def build_or_load( source, bp, await_finished=await_finished, - batch_size=batch_size, - 
rows_per_chunk=rows_per_chunk, monitors=monitors, ) if cache.is_finished: @@ -311,9 +327,9 @@ def build_or_load( return ProcessedAudioCache(cache) @staticmethod - def load(cache_dir, batch_size: int = 128): + def load(cache_dir): """ - Load a TokenizedDocumentCache from a directory. If the ledger file is not present, this will raise a + Load a ProcessedAudioCache from a directory. If the ledger file is not present, this will raise a FileNotFoundError. NOTE: ATM this attempts to migrate old caches to the new format, but this will be removed in the future. @@ -323,7 +339,7 @@ def load(cache_dir, batch_size: int = 128): """ try: - cache = ShardCache.load(cache_dir, batch_size=batch_size) + cache = TreeCache.load(cache_dir, AudioTextDict_exemplar) return ProcessedAudioCache(cache) except FileNotFoundError: raise FileNotFoundError(f"{cache_dir} is not a complete cache") @@ -331,15 +347,6 @@ def load(cache_dir, batch_size: int = 128): logger.exception("error loading cache") raise - def shard(self, shard_index, num_shards): - if num_shards <= shard_index: - raise ValueError(f"Shard index {shard_index} is out of range") - - if num_shards == 1: - return self - - return ProcessedAudioCache(self.chunk_cache.shard(shard_index, num_shards)) - @dataclass class AudioIODatasetConfig(AudioDatasetSourceConfig, AudioTaskConfig): @@ -351,16 +358,12 @@ def train_set(self, batch_size: int, monitors: Union[bool, List[MetricsMonitor]] raise ValueError("No training set!") return ds - def validation_set( - self, batch_size: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> Optional[ProcessedAudioCache]: - return self.build_or_load_cache(self.validation_split, batch_size=batch_size, monitors=monitors) + def validation_set(self, monitors: Union[bool, List[MetricsMonitor]] = True) -> Optional[ProcessedAudioCache]: + return self.build_or_load_cache(self.validation_split, monitors=monitors) - def validation_sets( - self, batch_size: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> Mapping[str, ProcessedAudioCache]: + def validation_sets(self, monitors: Union[bool, List[MetricsMonitor]] = True) -> Mapping[str, ProcessedAudioCache]: if self._has_validation_set: - validation_set = self.validation_set(batch_size, monitors) + validation_set = self.validation_set(monitors) if validation_set is not None: return {"": validation_set} return {} @@ -393,7 +396,7 @@ def build_or_load_cache( name = logger_name or os.path.basename(self.cache_dir) try: - return ProcessedAudioCache.load(split_cache_dir, batch_size=batch_size) + return ProcessedAudioCache.load(split_cache_dir) except FileNotFoundError: pass @@ -420,16 +423,16 @@ def build_or_load_cache( enforce_bos=self.enforce_bos, enforce_eos=self.enforce_eos, batch_size=batch_size, - rows_per_chunk=self.rows_per_chunk, monitors=monitors, await_finished=(split == "validation"), + max_length=self.max_length, ) -class AudioTextDataset(ShardableDataset[AudioTextExample]): +class AudioTextDataset(MappedAsyncDataset[AudioTextDict, AudioTextExample]): def __init__( self, - dataset: ShardableDataset[AudioTextStorageBatch], + dataset: AsyncDataset[AudioTextDict], TextPos: Axis, AudioPos: hax.AxisSelector, KPos: Axis, @@ -443,28 +446,23 @@ def __init__( self.key = key self.ignore_id = ignore_index - def shard(self, shard_id: int, num_shards: int) -> "AudioTextDataset": - return AudioTextDataset( - self.dataset.shard(shard_id, num_shards), - self.TextPos, - self.AudioPos, - self.KPos, - self.key, - self.ignore_id, - ) - - def __iter__(self) -> 
Iterator[AudioTextExample]: sharding = jax.sharding.SingleDeviceSharding(jax.local_devices(backend="cpu")[0]) - with use_cpu_device(): - - @functools.partial(eqx.filter_jit, out_shardings=sharding) - def _convert_example(inputs: AudioTextDict) -> "AudioTextExample": + @functools.partial(eqx.filter_jit, out_shardings=sharding) + def _convert_example(inputs: AudioTextDict) -> "AudioTextExample": + with local_cpu_mesh(): tokens = hax.named(inputs["input_ids"], self.TextPos) audio_features = hax.named(inputs["input_features"], self.AudioPos) - return AudioTextExample.init(audio_features, tokens, ignore_id=self.ignore_id) - for example in self.dataset: - converted_example = _convert_example(example) - yield converted_example + super().__init__(self.dataset, _convert_example) + + # def __iter__(self) -> Iterator[AudioTextExample]: + # + # + # with use_cpu_device(): + # + # + # for example in self.dataset: + # converted_example = _convert_example(example) + # yield converted_example diff --git a/src/levanter/data/dataset.py b/src/levanter/data/dataset.py index 14c8979b3..def0c158a 100644 --- a/src/levanter/data/dataset.py +++ b/src/levanter/data/dataset.py @@ -1,66 +1,356 @@ -from abc import ABC, abstractmethod -from typing import Iterable, Iterator, List, TypeVar +import abc +import asyncio +import logging +from concurrent.futures import ThreadPoolExecutor +from typing import Callable, Generic, Optional, Sequence, TypeVar -import jax.random as jrandom +import jax.random +import numpy as np from jax.random import PRNGKey +from levanter.utils import thread_utils -T = TypeVar("T", covariant=True) +logger = logging.getLogger(__name__) -class Dataset(Iterable[T], ABC): - @abstractmethod - def __iter__(self) -> Iterator[T]: + +T_co = TypeVar("T_co", covariant=True) +T = TypeVar("T") +U = TypeVar("U") + + +_executor = ThreadPoolExecutor(max_workers=10) + + +class DatasetBase(abc.ABC, Generic[T_co]): + """ + Base class for sync and async datasets. This class is not meant to be used directly. + """ + + @abc.abstractmethod + def as_async_dataset(self) -> "AsyncDataset[T_co]": + raise NotImplementedError("...") + + @abc.abstractmethod + def as_sync_dataset(self) -> "SyncDataset[T_co]": + raise NotImplementedError("...") + + +class AsyncDataset(DatasetBase[T_co]): + """ + An asynchronous dataset that can be used with async/await syntax. In Levanter, we use AsyncDataset for two purposes: + * To represent datasets that are inherently asynchronous (e.g. reading from disk, network, etc.). + * To represent datasets that are still being constructed. + + The core methods in this class are: + * `async_len`: Returns the final length of the dataset. + * `get_batch`: Returns a batch of items from the dataset. + * `current_len`: Returns the current length of the dataset. This may be None if no current length is known. + """ + + @abc.abstractmethod + async def async_len(self) -> int: + raise NotImplementedError + + @abc.abstractmethod + async def final_length_is_known(self) -> bool: + """Returns whether the final length of the dataset is known. + If this returns False, the current_len of the dataset may change in the future.""" + raise NotImplementedError + + @abc.abstractmethod + def is_finite(self) -> bool: + """ + Returns whether the dataset will have a known length in the future (e.g. if it's being constructed). + If this returns False, the length of the dataset is infinite or unknowable. 
+ """ raise NotImplementedError + @abc.abstractmethod + async def current_len(self) -> Optional[int]: + """ + Returns the current length of the dataset that won't require (expensive) waiting. -class ShardableDataset(Dataset[T], ABC): - @abstractmethod - def shard(self, shard_id: int, num_shards: int) -> "ShardableDataset[T]": + If the current length is not known, returns None. This might block temporarily for a short time to get the + current length. + """ raise NotImplementedError - @abstractmethod - def __iter__(self) -> Iterator[T]: + async def getitem_async(self, index: int) -> T_co: + """ + Returns the item at the given index. Typically implemented as a wrapper around `get_batch`. + + In general, it is better to call (and override) `get_batch` instead of this method. + """ + return (await self.get_batch([index]))[0] + + @abc.abstractmethod + async def get_batch(self, indices: Sequence[int]) -> Sequence[T_co]: raise NotImplementedError + async def wait_until_len_at_least(self, length: int) -> int: + """ + Returns the length of the dataset once it is at least `length` or if the dataset has a known (finished) length. + + The default implementation is a naive busy-wait loop. You should override this method for more efficient + implementations. + """ + return await naive_busy_wait_until_len_at_least(self, length) + + def as_sync_dataset(self): + return SyncifiedDataset(self) + + def as_async_dataset(self) -> "AsyncDataset[T_co]": + return self + + def map(self, fn: Callable[[T_co], U], *extra_args, **extra_kwargs) -> "MappedAsyncDataset[T_co, U]": + return MappedAsyncDataset(self, fn, *extra_args, **extra_kwargs) + + def shuffle(self, key: PRNGKey): + import levanter.data.permutation as permutation + + return permutation.PermutationDataset(self, key) + + def era_shuffle(self, era_length: int, key: PRNGKey): + import levanter.data.permutation as permutation + + return permutation.EraShufflingDataset(self, era_length, key=key) + + +async def naive_busy_wait_until_len_at_least(dataset: AsyncDataset[T_co], length: int) -> int: + """ + Runs a busy-wait loop until the dataset has at least `length` items or the final length is known. + + Returns the current length of the dataset when either the dataset has at least `length` items or the final length is + known. + + You should probably implement this in a more efficient way. This is just a naive implementation. + """ + while not await dataset.final_length_is_known(): + current_len = await dataset.current_len() + if current_len is None: + raise ValueError("Dataset has unknown length") + if current_len <= length: + await asyncio.sleep(0.1) + else: + return current_len + + return await dataset.async_len() + + +class SyncDataset(DatasetBase[T_co]): + """ + A synchronous dataset that can be used with regular Python syntax. In Levanter, we mainly do not use this class. + You can use this class if it's easier, then convert it to an AsyncDataset using `as_async_dataset`. This + is not as efficient as using an AsyncDataset directly, but it can be useful for testing or for simpler code. + """ + + @abc.abstractmethod + def __len__(self) -> int: + """ + Returns the final length of the data store. + May raise if the length is not known. + """ + + @abc.abstractmethod + def has_len(self) -> bool: + """ + Whether the data store currently has a known length. If this returns False, then the length of the data store + may change in the future. 
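To make the `AsyncDataset` contract above concrete, here is a minimal sketch of a finite, fully in-memory implementation, assuming only the abstract methods shown in this diff (`async_len`, `final_length_is_known`, `is_finite`, `current_len`, `get_batch`); `RangeDataset` is a hypothetical name used for illustration:

```python
import asyncio
from typing import Optional, Sequence

from levanter.data import AsyncDataset


class RangeDataset(AsyncDataset[int]):
    """Hypothetical dataset yielding 0..n-1; every method is trivial because the data is in memory."""

    def __init__(self, n: int):
        self.n = n

    async def async_len(self) -> int:
        return self.n

    async def final_length_is_known(self) -> bool:
        return True  # the length never changes

    def is_finite(self) -> bool:
        return True

    async def current_len(self) -> Optional[int]:
        return self.n

    async def get_batch(self, indices: Sequence[int]) -> Sequence[int]:
        return [int(i) for i in indices]


print(asyncio.run(RangeDataset(10).get_batch([0, 3, 7])))  # [0, 3, 7]
```

With those five methods in place, the inherited helpers (`getitem_async`, `map`, `shuffle`, `era_shuffle`) come for free.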
+ """ + pass + + @abc.abstractmethod + def current_len(self) -> Optional[int]: + """ + Returns the current length of the data store. If the length is infinite or not known, returns None. + """ + pass + + def __getitem__(self, index: int) -> T_co: + return self.get_batch([index])[0] + + @abc.abstractmethod + def get_batch(self, indices: Sequence[int] | np.ndarray) -> Sequence[T_co]: + pass + + def as_async_dataset(self) -> "AsyncDataset[T_co]": + return AsyncifiedDataset(self) + + def as_sync_dataset(self) -> "SyncDataset[T_co]": + return self + -class InMemoryDataset(ShardableDataset[T]): - def __init__(self, items: List[T]): - self.items = items +class SyncifiedDataset(SyncDataset[T_co]): + def __init__(self, dataset: AsyncDataset[T_co]): + self.dataset = dataset + + def _run_coroutine(self, coro): + return thread_utils.blocking_wait(coro) + + def __len__(self) -> int: + return self._run_coroutine(self.dataset.async_len()) + + def has_len(self) -> bool: + return self.dataset.is_finite() + + def current_len(self) -> Optional[int]: + return self._run_coroutine(self.dataset.current_len()) - def __iter__(self) -> Iterator[T]: - return iter(self.items) + def get_batch(self, indices: Sequence[int] | np.ndarray) -> Sequence[T_co]: + return self._run_coroutine(self.dataset.get_batch(indices)) - def shard(self, shard_id: int, num_shards: int) -> "InMemoryDataset[T]": - return InMemoryDataset(self.items[shard_id::num_shards]) + def __getitem__(self, index: int) -> T_co: + return self._run_coroutine(self.dataset.getitem_async(index)) -class ShuffleDataset(ShardableDataset[T]): - def __init__(self, dataset: Dataset[T], key: PRNGKey, buffer_size: int): +class AsyncifiedDataset(AsyncDataset[T_co]): + def __init__(self, dataset: SyncDataset[T_co]): self.dataset = dataset - self.buffer_size = buffer_size - self.key = key - - def shard(self, shard_id: int, num_shards: int) -> "ShuffleDataset": - key = jrandom.fold_in(self.key, shard_id) - return ShuffleDataset(self.dataset.shard(shard_id, num_shards), key, self.buffer_size) # type: ignore - - def __iter__(self) -> Iterator[T]: - inner = iter(self.dataset) - buffer: List[T] = [] - current_key = self.key - - for item in inner: - if len(buffer) == self.buffer_size: - current_key, subkey = jrandom.split(current_key) - i = jrandom.randint(subkey, (), 0, len(buffer)) - yield buffer[i] - buffer[i] = item + + async def async_len(self) -> int: + return len(self.dataset) + + async def final_length_is_known(self) -> bool: + return self.dataset.has_len() + + def is_finite(self) -> bool: + return self.dataset.has_len() + + async def current_len(self) -> Optional[int]: + return self.dataset.current_len() + + async def get_batch(self, indices: Sequence[int]) -> Sequence[T_co]: + return self.dataset.get_batch(indices) + + async def getitem_async(self, index: int) -> T_co: + return self.dataset[index] + + def __repr__(self): + return f"WrappedAsyncDataset({repr(self.dataset)})" + + def __str__(self): + return f"WrappedAsyncDataset({str(self.dataset)})" + + +class ListAsyncDataset(AsyncDataset[T]): + """ + A simple dataset that wraps a list. Mostly for testing. 
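The synchronous half works the same way. A sketch of a trivial `SyncDataset` over a Python list (the `ListSyncDataset` name is hypothetical), which `as_async_dataset()` then wraps in the `AsyncifiedDataset` adapter shown above:

```python
from typing import Optional, Sequence

import numpy as np

from levanter.data import SyncDataset


class ListSyncDataset(SyncDataset[int]):
    """Hypothetical synchronous dataset backed by a plain Python list."""

    def __init__(self, items: list[int]):
        self.items = items

    def __len__(self) -> int:
        return len(self.items)

    def has_len(self) -> bool:
        return True

    def current_len(self) -> Optional[int]:
        return len(self.items)

    def get_batch(self, indices: Sequence[int] | np.ndarray) -> Sequence[int]:
        return [self.items[int(i)] for i in indices]


async_view = ListSyncDataset([4, 8, 15, 16]).as_async_dataset()  # an AsyncifiedDataset wrapper
```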
+ """ + + def __init__(self, data: list[T], is_complete: bool = False): + self.data = data + self.is_complete = is_complete + if not is_complete: + self.complete_promise: Optional[asyncio.Future[None]] = asyncio.Future() + self.length_updated: Optional[asyncio.Condition] = asyncio.Condition() + else: + self.complete_promise = None + self.length_updated = None + + async def async_len(self) -> int: + # this is the final length + if not self.is_complete: + assert self.complete_promise is not None + await self.complete_promise + return len(self.data) + + async def final_length_is_known(self) -> bool: + return self.is_complete + + def is_finite(self) -> bool: + return True + + async def current_len(self) -> Optional[int]: + return len(self.data) + + async def get_batch(self, indices: Sequence[int]) -> Sequence[T]: + await self.wait_until_len_at_least(max(indices) + 1) + return [self.data[i] for i in indices] + + def append(self, item: T): + if self.is_complete: + raise ValueError("Cannot append to a finalized dataset") + self.data.append(item) + asyncio.create_task(self.notify_length_update()) + + def finalize(self): + self.is_complete = True + if self.complete_promise is not None: + self.complete_promise.set_result(None) + if not asyncio.get_event_loop().is_running(): + _executor.submit(lambda: asyncio.run(self.notify_length_update())) else: - buffer.append(item) + asyncio.create_task(self.notify_length_update()) + + async def notify_length_update(self): + async with self.length_updated: + self.length_updated.notify_all() + + async def wait_until_len_at_least(self, length: int) -> int: + if self.is_complete: + return len(self.data) + + assert self.length_updated is not None + + async with self.length_updated: + while len(self.data) < length and not self.is_complete: + await self.length_updated.wait() + + return len(self.data) + + +class MappedAsyncDataset(AsyncDataset[U], Generic[T, U]): + """ + A dataset that applies a function to each item in the dataset. + You can pass extra arguments to the function using `*extra_args` and `**extra_kwargs`. + If a kwarg called `key` is passed, it will be treated as a PRNGKey and folded in with the index of the item + for each call to the function. 
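A short usage sketch for `MappedAsyncDataset` and the `key` kwarg described above, using the `ListAsyncDataset` from this same diff; `add_noise` is a hypothetical mapping function:

```python
import asyncio

import jax.random as jrandom

from levanter.data import ListAsyncDataset

base = ListAsyncDataset(list(range(8)), is_complete=True)

# plain map: each element is transformed independently
doubled = base.map(lambda x: x * 2)
print(asyncio.run(doubled.get_batch([0, 1, 2])))  # [0, 2, 4]


# randomized map: because `key` is passed as a kwarg, the dataset folds each
# example's index into the key, so example i always sees the same sub-key
def add_noise(x, *, key):
    return x + jrandom.normal(key, ())


noisy = base.map(add_noise, key=jrandom.PRNGKey(0))
```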
+ """ + + def __init__( + self, + dataset: AsyncDataset[T], + fn: Callable[[T], U] | Callable[[T, Optional[PRNGKey]], U], + *extra_args, + **extra_kwargs, + ): + self.dataset = dataset + self.fn = fn + self._extra_args = extra_args + self._extra_kwargs = extra_kwargs + + async def async_len(self) -> int: + return await self.dataset.async_len() + + async def final_length_is_known(self) -> bool: + return await self.dataset.final_length_is_known() + + def is_finite(self) -> bool: + return self.dataset.is_finite() + + async def current_len(self) -> Optional[int]: + return await self.dataset.current_len() + + def _maybe_fold_in_key(self, key, index): + if key is not None: + key = jax.random.fold_in(key, index) + return key + + async def get_batch(self, indices: Sequence[int]) -> Sequence[U]: + items = await self.dataset.get_batch(indices) + return [self._call_fn(i, item) for i, item in zip(indices, items)] + + async def getitem_async(self, index: int) -> U: + return self._call_fn(index, await self.dataset.getitem_async(index)) + + async def wait_until_len_at_least(self, length: int) -> int: + return await self.dataset.wait_until_len_at_least(length) - while len(buffer) > 0: - current_key, subkey = jrandom.split(current_key) - i = jrandom.randint(subkey, (), 0, len(buffer)) - yield buffer[i] - del buffer[i] + def _call_fn(self, index, item): + if "key" in self._extra_kwargs: + key = self._maybe_fold_in_key(self._extra_kwargs["key"], index) + kwargs = {**self._extra_kwargs, "key": key} + else: + kwargs = self._extra_kwargs + return self.fn(item, *self._extra_args, **kwargs) diff --git a/src/levanter/data/loader.py b/src/levanter/data/loader.py index b6e7f673f..ab97e0827 100644 --- a/src/levanter/data/loader.py +++ b/src/levanter/data/loader.py @@ -1,229 +1,257 @@ -import abc import functools import logging +import time from collections import defaultdict -from typing import Callable, Dict, Iterable, Iterator, List, Optional, Tuple, TypeVar, Union +from typing import Iterable, Iterator, Optional, Tuple, TypeVar import jax -import jax.numpy as jnp -import jax.tree_util as jtu +from jax import Array +from jax import numpy as jnp +from jax import tree_util as jtu from jax.experimental import multihost_utils from jax.sharding import Mesh, PartitionSpec -from jaxtyping import Array, PyTree +from jaxtyping import PyTree import haliax as hax -from haliax import NamedArray +from haliax import is_named_array +from haliax._src.util import index_where from haliax.partitioning import ResourceMapping -from haliax.util import is_named_array -from levanter.data import Dataset -from levanter.data.dataset import ShardableDataset -from levanter.mesh import local_devices_mapping, process_mesh_mapping +from levanter.data.dataset import AsyncDataset +from levanter.data.utils import batched from levanter.shapes import NamedShapeSpec, ShapeSpec, to_raw_shape from levanter.utils.background_iterable import BackgroundIterable -from levanter.utils.py_utils import non_caching_cycle +from levanter.utils.thread_utils import blocking_wait Ex = TypeVar("Ex") +_TensorSliceIndex = tuple[slice, ...] logger = logging.getLogger(__name__) -# TODO: write tests to verify this works when data spans multiple processes -_TensorSliceIndex = Tuple[slice, ...] +class DataLoader(Iterable[Ex]): + def __init__( + self, + Batch: hax.Axis, + data: AsyncDataset[Ex], + max_buffered_batches: Optional[int], + mesh: Mesh, + axis_resources: Optional[ResourceMapping], + # this is set heuristically for the typical tokenseqdataset we use. 
Should probably tune + prefetch_size: int = 32, + ): + """ + TODO: document this -class BatchLoader(Iterable[Ex], abc.ABC): - Batch: hax.Axis - mesh: Mesh - axis_resources: Optional[ResourceMapping] + Args: + Batch (hax.Axis): The batch axis + data (AsyncDataset[Ex]): The dataset to load from + max_buffered_batches (Optional[int]): The maximum number of batches to buffer. If None, the buffer is unbounded. + If <0, the buffer is disabled and single threaded operation is used. + axis_resources (Optional[ResourceMapping]): axis mapping + prefetch_size (int): The number of batches to prefetch at once + mesh (Mesh): The mesh to use - def __init__(self, max_capacity: Optional[int], axis_resources: Optional[ResourceMapping]): - """ - :param max_capacity: if not None, the maximum number of batches to keep in memory at once. If <0 then load in the main thread - :param axis_resources: """ - self.max_capacity = max_capacity + self.max_buffered_batches = max_buffered_batches + self.prefetch_size = prefetch_size self.axis_resources = axis_resources + self.data_store = data + self.mesh = mesh + self.Batch = Batch - def __iter__(self) -> Iterator[Ex]: - ax_resources = self.axis_resources - if ax_resources is None: - ax_resources = hax.partitioning.current_thread_local_mapping() - - def produce_batches(): - with hax.axis_mapping(ax_resources): - for batch in self._produce_batches(): - yield batch - - if self.max_capacity is not None and self.max_capacity < 0: - yield from produce_batches() - else: - bg_iter = BackgroundIterable(produce_batches, max_capacity=self.max_capacity) - yield from bg_iter - - @abc.abstractmethod - def _produce_batches(self) -> Iterator[Ex]: - raise NotImplementedError + def _exemplar_shape(): + return blocking_wait(self.data_store.getitem_async(0)) + + self._ex_leaves, self._ex_structure = jax.tree_flatten(_exemplar_shape(), is_leaf=is_named_array) + + local_device_indices, local_indices = self._compute_local_device_indices() + + self._local_device_indices: dict[jax.Device, range] = local_device_indices + # this is just the flat indices + self._local_indices: list[int] = local_indices + + def _compute_local_device_indices(self): + sharding: jax.sharding.Sharding = hax.partitioning.sharding_for_axis( + self.Batch.name, self.axis_resources, self.mesh + ) + # this is a map from devices to the slice of the array that they contain (in the global array) + local_indices_map = sharding.addressable_devices_indices_map((self.batch_size,)) + # we just want all the indices + local_device_indices: dict[jax.Device, range] = { + device1: range(*idx[0].indices(self.batch_size)) + for device1, idx in local_indices_map.items() + if idx is not None + } + local_indices: list[int] = [] + for device, indices in local_device_indices.items(): + local_indices.extend(indices) + return local_device_indices, local_indices @property - def batch_size(self) -> int: + def batch_size(self): return self.Batch.size - def _construct_global_array_for_tree(self, item_exemplar: PyTree, get_batch_items: Callable[[int, int], PyTree]): - # ok this is a bit messy: we want to create a batch of items from our dataset, only loading - # the relevant data for each process. - # In general an item is represented as a PyTree, whose leaves are (named or unnamed) arrays. - # To make a batch we just want to add a leading dimension to each leaf array by stacking. 
- # That is, we have (conceptually) a List[PyTree[Array]] and we want to produce a PyTree[List[Array]] - # The difference is that we want to do this in a way that only loads the relevant data for each process - # So it's more that we have a LocalBatch[PyTree[Array]] and we want to produce a PyTree[GlobalBatch[Array]] - # because more than one device can get the same data, we need to make sure we only load it once since we're - # streaming. This is the cache - stacked_local_batch: Dict[Tuple[int, int], List[Array | hax.NamedArray]] = {} - - def get_local_batch(begin: int, end: int) -> List[Array]: - key = (begin, end) - if key in stacked_local_batch: - return stacked_local_batch[key] - - individual_datums = get_batch_items(begin, end) - - device_batch = _stack_tree(self.Batch.name, individual_datums) - batch_leaves = jtu.tree_leaves(device_batch) - - stacked_local_batch[key] = batch_leaves - - return batch_leaves - - def get_local_data_for_leaf(indices: _TensorSliceIndex, leaf_index: int) -> Array: - batch_slice = indices[0] - begin, end, _ = batch_slice.indices(self.Batch.size) - local_batch = get_local_batch(begin, end) - leaf = local_batch[leaf_index] - other_indices = indices[1:] - if all(idx == slice(None) for idx in other_indices): - return leaf - else: - return leaf[(..., *indices[1:])] - - def make_global_array_for_leaf(leaf_index, item_leaf_shape: Union[ShapeSpec, NamedShapeSpec]): - raw_array = jax.make_array_from_callback( - to_raw_shape(item_leaf_shape), - jax.sharding.NamedSharding(self.mesh, self._pspec_for(item_leaf_shape)), - lambda indices: get_local_data_for_leaf(indices, leaf_index), - ) - if isinstance(item_leaf_shape, NamedShapeSpec): - return NamedArray(raw_array, item_leaf_shape.shape) - else: - return raw_array - - item_leaves, item_shape = jtu.tree_flatten(item_exemplar, is_leaf=is_named_array) - - gda_leaves = [ - make_global_array_for_leaf(leaf_index, _batchified_shape(self.Batch, item_leaf)) - for leaf_index, item_leaf in enumerate(item_leaves) - ] - - gda_tree = jtu.tree_unflatten(item_shape, gda_leaves) - - return gda_tree - - def _pspec_for(self, shape_spec: Union[ShapeSpec, NamedShapeSpec]) -> PartitionSpec: + def __iter__(self): + return self.iter_from_step(None) + + def iter_from_step(self, start_from_batch: Optional[int] = None): + return DataLoaderIterator(self, start_from_batch=start_from_batch) + + +class DataLoaderIterator(Iterator[Ex]): + def __init__(self, data_loader: DataLoader, start_from_batch: Optional[int] = None): + self.dl = data_loader + self._start_from_batch = start_from_batch + self.mapping = self.dl.axis_resources + if self.mapping is None: + self.mapping = hax.partitioning.current_thread_local_mapping() + + # TODO: bring back non-prefetching version + buffered_batches = self.dl.max_buffered_batches + self._batches = iter(BackgroundIterable(self._produce_batches, max_capacity=buffered_batches)) + + def __next__(self): + time_start = time.time() + out = next(self._batches) + time_end = time.time() + if (time_end - time_start) > 0.5: + logger.info(f"Prefetch wasn't fast enough: {time_end - time_start:.3f}") + return out + + async def _produce_batches(self): + batch_number = self._start_from_batch or 0 + total_ex_loaded = 0 + done = False + while not done: + next_batch_numbers = [] + for i in range(self.dl.prefetch_size): + if self.dl.data_store.is_finite(): + next_end = (batch_number + 1) * self.dl.batch_size + available_len = await self.dl.data_store.wait_until_len_at_least(next_end) + if available_len < next_end: + done = True + break + 
+ next_batch_numbers.append(batch_number) + batch_number += 1 + + async for batch in self._retrieve_batches(next_batch_numbers): + yield batch + + total_ex_loaded += self.dl.batch_size * len(next_batch_numbers) + + async def _retrieve_batches(self, batch_numbers: list[int]): + with hax.axis_mapping(self.mapping), self.dl.mesh: + indices_for_this_batch_of_batches: list[int] = [] + for bn in batch_numbers: + indices_this_batch = range(bn * self.dl.batch_size, (bn + 1) * self.dl.batch_size, 1) + indices_this_batch_this_process = [indices_this_batch[i] for i in self.dl._local_indices] + indices_for_this_batch_of_batches.extend(indices_this_batch_this_process) + + time_start = time.time() + individual_datums = await self.dl.data_store.get_batch(indices_for_this_batch_of_batches) + time_end = time.time() + logger.debug(f"Time to get {len(batch_numbers)} batches: {time_end - time_start:.3f}") + time_start = time.time() + # reshape to be per batch + individual_datums = list(batched(individual_datums, len(self.dl._local_indices))) + + # below we're gonna get the indices relative to this batch (i.e. 0 to batch_size) + index_to_datum = [ + {index: datum for index, datum in zip(self.dl._local_indices, individual_data_batch)} + for individual_data_batch in individual_datums + ] + + def get_local_batch(bn: int, begin: int, end: int) -> list: + # TODO: if we ever do "big data" (i.e. huge examples) we might want to be able to load part of an example + # which will require support from the datastore (i.e. tensorstore) + device_batch = _stack_tree(self.dl.Batch.name, [index_to_datum[bn][i] for i in range(begin, end)]) + batch_leaves = hax.tree_util.tree_leaves(device_batch) + return batch_leaves + + def get_local_data_for_leaf(bn, indices: _TensorSliceIndex, leaf_index: int) -> Array: + batch_slice = indices[0] + begin, end, stride = batch_slice.indices(self.dl.batch_size) + if stride != 1: + raise ValueError("Stride must be 1") + + leaf_data = (get_local_batch(bn, begin, end))[leaf_index] + + if isinstance(leaf_data, hax.NamedArray): + # select out the batch axis + batch_index = index_where(lambda ax: ax.name == self.dl.Batch.name, leaf_data.axes) + new_indices = list(indices) + new_indices[batch_index] = slice(None) + return leaf_data.array[tuple(new_indices)] + + else: + other_indices = indices[1:] + if all(idx == slice(None) for idx in other_indices): + return leaf_data + else: + # TODO: this doesn't work with named axes + return leaf_data[(..., *other_indices)] + + for batch_offset, bn in enumerate(batch_numbers): + + def make_global_array_for_leaf(leaf_index, item_leaf_shape: ShapeSpec | NamedShapeSpec): + def get_data(indices): + return get_local_data_for_leaf(batch_offset, indices, leaf_index) + + raw_array = jax.make_array_from_callback( + to_raw_shape(item_leaf_shape), + jax.sharding.NamedSharding(self.dl.mesh, self._pspec_for(item_leaf_shape)), + get_data, + ) + if isinstance(item_leaf_shape, NamedShapeSpec): + return hax.NamedArray(raw_array, item_leaf_shape.shape) + else: + return raw_array + + gda_leaves = [ + make_global_array_for_leaf(leaf_index, _batchified_shape(self.dl.Batch, item_leaf)) + for leaf_index, item_leaf in enumerate(self.dl._ex_leaves) + ] + + gda_tree = jax.tree.unflatten(self.dl._ex_structure, gda_leaves) + yield gda_tree + + def _pspec_for(self, shape_spec: ShapeSpec | NamedShapeSpec) -> PartitionSpec: if isinstance(shape_spec, ShapeSpec): # type: ignore - batch_name = hax.partitioning.physical_axis_name(self.Batch, self.axis_resources) + batch_name = 
hax.partitioning.physical_axis_name(self.dl.Batch, self.dl.axis_resources) return PartitionSpec(batch_name, *((None,) * (len(shape_spec.shape) - 1))) else: - return hax.partitioning.pspec_for_axis(shape_spec.shape, self.axis_resources) # type: ignore - - -class ShardedBatchLoader(BatchLoader[Ex]): - """ - ShardedBatchLoader wraps a "local dataset" (a dataset that is shardable and can be iterated over) to produce - distributed/sharded jax.Arrays representing batches of data. Each array that has a global shape - but only has the data for some of the chunks of the array (namely, the ones on the local devices). - Thus, each process loads the data for its devices. - - **NOTE: ShardedBatchLoader loops forever since it's hard to do coordination.** - - The details are a bit complex: We have a device mesh of shape (data, model). We want each row of the device mesh to - get batch_size//num_rows examples. Usually, a process will be responsible for one or more entire rows, meaning - that it wil load data that is distinct from every other process. However, if num_cols > num_devices_per_process, - then some processes will need to load the same data. We use the process_mesh_position to determine which data to - load, by determining which row(s) of the device mesh the process is responsible for. - - :arg local_dataset: a dataset that is shardable and can be iterated over - :arg mesh: the device mesh - :arg Batch: the batch size - :param max_capacity: if not None, the maximum number of batches to keep in memory at once. If <0 then load in the main thread - """ - - def __init__( - self, - local_dataset: ShardableDataset[Ex], - mesh: Mesh, - Batch: hax.Axis, - axis_resources: Optional[ResourceMapping] = None, - max_capacity: Optional[int] = 64, - *, - override_process_data_pos: Optional[int] = None, # for testing - override_process_data_groups: Optional[int] = None, # for testing - ): - self.mesh = mesh - self.Batch = Batch - - process_mesh_map = process_mesh_mapping(self.mesh) - local_devices_map = local_devices_mapping(self.mesh) - process_data_pos = override_process_data_pos or process_mesh_map[jax.process_index()] - num_data_process_groups = override_process_data_groups or max(process_mesh_map.values()) + 1 - - if not override_process_data_groups: - assert num_data_process_groups <= jax.process_count() - - self.process_data_pos = process_data_pos - self.num_data_process_groups = num_data_process_groups - assert self.Batch.size % num_data_process_groups == 0 + return hax.partitioning.pspec_for_axis(shape_spec.shape, self.dl.axis_resources) # type: ignore - self.process_mesh_map = process_mesh_map - self.local_devices_map = local_devices_map - self.per_device_batch_size = self.batch_size // self.mesh.devices.shape[0] // self.mesh.devices.shape[1] - self.item_dataset = local_dataset.shard(process_data_pos, num_data_process_groups) - super().__init__(max_capacity, axis_resources) +def _abstractify(x): + def _abstractify_array(x): + if isinstance(x, jax.numpy.ndarray): + return ShapeSpec(x.shape, x.dtype) + elif isinstance(x, hax.NamedArray): + return NamedShapeSpec(x.axes, x.dtype) - def _produce_batches(self) -> Iterator[PyTree]: - one_item_generator = non_caching_cycle(self.item_dataset) - batched = _batched(one_item_generator, self.local_batch_size) + return x - def batch_callback(global_begin, _): - # global_begin is uid for DP/FSDP - # DP_id * per_device_bs = global_begin - device_pos = global_begin // self.per_device_batch_size + return hax.tree_util.tree_map(_abstractify_array, x) - begin = 
self.local_devices_map[device_pos] * self.per_device_batch_size - end = begin + self.per_device_batch_size - return local_batch[begin:end] - - while True: - local_batch: List[PyTree] = next(batched) - - batch = self._construct_global_array_for_tree( - item_exemplar=local_batch[0], - get_batch_items=batch_callback, - ) - - yield batch +def _batchified_shape(Batch, leaf: hax.NamedArray | Array) -> ShapeSpec | NamedShapeSpec: + if is_named_array(leaf): + return NamedShapeSpec((Batch,) + leaf.axes, leaf.dtype) + else: + return ShapeSpec((Batch.size,) + leaf.shape, leaf.dtype) - @property - def batch_size(self) -> int: - """Returns the 'global' batch size: the effective number of examples in a batch across all devices/hosts""" - return self.Batch.size - @property - def local_batch_size(self) -> int: - """Returns the 'local' batch size: the number of examples in a batch on this host""" - return self.batch_size // self.num_data_process_groups +def _pspec_for(self, shape_spec: ShapeSpec | NamedShapeSpec) -> PartitionSpec: + if isinstance(shape_spec, ShapeSpec): # type: ignore + batch_name = hax.partitioning.physical_axis_name(self.Batch, self.axis_resources) + return PartitionSpec(batch_name, *((None,) * (len(shape_spec.shape) - 1))) + else: + return hax.partitioning.pspec_for_axis(shape_spec.shape, self.axis_resources) # type: ignore @functools.partial(jax.jit, static_argnums=(0,)) @@ -234,50 +262,7 @@ def _stack_leaves_unchecked(*leaves): else: return jnp.stack(leaves) - return jax.tree_map(_stack_leaves_unchecked, *individual_datums, is_leaf=is_named_array) - - -class ReplicatedBatchLoader(BatchLoader[Ex]): - """A batch loader that creates batches without sharded data loading. All examples are loaded on all machines and then - sharded. This is useful if you have a small dataset and want to make a single pass over it. - - Note: this class discards the final batch if it is smaller than the batch size. - - :arg item_dataset: a dataset that is shardable and can be iterated over - :arg mesh: the device mesh - :arg Batch: the batch size - :arg axis_resources: the resources for the batch axis - :param max_capacity: if not None, the maximum number of batches to keep in memory at once. 
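The new DataLoaderIterator above assembles each batch as a globally-shaped, sharded jax.Array: every process materializes only its local rows and hands a callback to jax.make_array_from_callback. A minimal, single-process sketch of that technique follows; GLOBAL_BATCH, FEATURES, local_rows and data_for_shard are illustrative names, not Levanter's API.

import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec

FEATURES = 4
GLOBAL_BATCH = 2 * jax.device_count()

mesh = Mesh(np.array(jax.devices()), ("batch",))
sharding = NamedSharding(mesh, PartitionSpec("batch", None))

# stand-in for the rows this process actually fetched from the data store
local_rows = np.arange(GLOBAL_BATCH * FEATURES, dtype=np.float32).reshape(GLOBAL_BATCH, FEATURES)

def data_for_shard(index):
    # `index` is a tuple of slices selecting one device's shard of the global array;
    # a real loader only ever touches rows it holds locally
    return local_rows[index]

global_batch = jax.make_array_from_callback((GLOBAL_BATCH, FEATURES), sharding, data_for_shard)
print(global_batch.shape, global_batch.sharding)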
If <0 then load in the main thread - """ - - def __init__( - self, - item_dataset: Dataset[Ex], - mesh: Mesh, - Batch: hax.Axis, - axis_resources: Optional[ResourceMapping] = None, - max_capacity: Optional[int] = 64, - ): - assert item_dataset is not None - self.item_dataset = item_dataset - self.mesh = mesh - self.Batch = Batch - - super().__init__(max_capacity, axis_resources) - - def _produce_batches(self): - for batch in _batched(self.item_dataset, self.Batch.size): - sharded = self._construct_global_array_for_tree( - item_exemplar=batch[0], get_batch_items=lambda begin, end: batch[begin:end] - ) - yield sharded - - -def _batchified_shape(Batch, leaf: Union[NamedArray, Array]): - if isinstance(leaf, NamedArray): - return NamedShapeSpec((Batch,) + leaf.axes, leaf.dtype) - else: - return ShapeSpec((Batch.size,) + leaf.shape, leaf.dtype) + return jax.tree.map(_stack_leaves_unchecked, *individual_datums, is_leaf=is_named_array) def check_sharded_consistency(tree: PyTree, check_disjoint_indices_are_different: bool = False): @@ -340,12 +325,3 @@ def _to_tuple(index: Tuple[slice, ...]) -> Tuple[Tuple[int, int], ...]: for leaf in jtu.tree_leaves(tree): check_array(leaf) - - -def _batched(item_iter, size): - batch = [] - for item in item_iter: - batch.append(item) - if len(batch) == size: - yield batch - batch = [] diff --git a/src/levanter/data/metrics_monitor.py b/src/levanter/data/metrics_monitor.py index 264229cdc..4e4619ffb 100644 --- a/src/levanter/data/metrics_monitor.py +++ b/src/levanter/data/metrics_monitor.py @@ -25,7 +25,6 @@ @dataclass class InProgressCacheMetrics: rows_finished: int = 0 - chunks_finished: int = 0 shards_finished: int = 0 field_counts: Dict[str, int] = dataclasses.field(default_factory=dict) is_finished: bool = False @@ -63,7 +62,6 @@ def _init_progress(self, metrics): columns = [ BarColumn(), TaskProgressColumn(), - TextColumn("| {task.fields[chunks_finished]} chunks", justify="center"), TextColumn("| {task.fields[rows_finished]} docs", justify="center"), ] @@ -103,7 +101,6 @@ def __call__(self, metrics: InProgressCacheMetrics): to_log: Dict[str, Any] = {} to_log[f"{self.prefix}/shards"] = metrics.shards_finished - to_log[f"{self.prefix}/chunks"] = metrics.chunks_finished to_log[f"{self.prefix}/rows"] = metrics.rows_finished for field, count in metrics.field_counts.items(): @@ -117,7 +114,6 @@ def __call__(self, metrics: InProgressCacheMetrics): # assert self.last_time is not None # elapsed = time.time() - self.last_time # to_log[f"{self.prefix}/shards_per_s"] = (metrics.shards_finished - self.last_metrics.shards_finished) / elapsed - # to_log[f"{self.prefix}/chunks_per_s"] = (metrics.chunks_finished - self.last_metrics.chunks_finished) / elapsed # to_log[f"{self.prefix}/rows_per_s"] = (metrics.rows_finished - self.last_metrics.rows_finished) / elapsed # # for field, count in metrics.field_counts.items(): @@ -132,19 +128,28 @@ def __call__(self, metrics: InProgressCacheMetrics): class LoggerMetricsMonitor(MetricsMonitor): # TODO: I'd like to get the trainer pbar migrated to rich and just use rich everywhere, but until then, # we have separate logging - def __init__(self, logger: Optional[Union[pylogging.Logger, str]] = None, level=pylogging.INFO): + def __init__( + self, + logger: Optional[Union[pylogging.Logger, str]] = None, + level=pylogging.INFO, + log_interval: float | int = 30.0, + ): if isinstance(logger, str): logger = pylogging.getLogger(logger) self.logger = logger or pylogging.getLogger(__name__) self.level = level + self.log_interval = log_interval + 
self._last_log_time = time.time() def __call__(self, metrics: InProgressCacheMetrics): if jax.process_index() == 0: - self.logger.log( - self.level, - f" done: Shards: {metrics.shards_finished} | Chunks: {metrics.chunks_finished} | Docs:" - f" {metrics.rows_finished}", - ) + if time.time() - self._last_log_time > self.log_interval: + self._last_log_time = time.time() + + self.logger.log( + self.level, + f" done: Shards: {metrics.shards_finished} | Docs: {metrics.rows_finished}", + ) if metrics.is_finished: self.logger.info("Cache creation finished") diff --git a/src/levanter/data/mixture.py b/src/levanter/data/mixture.py index ba7ae674b..eb1bdfaaf 100644 --- a/src/levanter/data/mixture.py +++ b/src/levanter/data/mixture.py @@ -1,12 +1,18 @@ -from typing import Dict, Iterator, Mapping, TypeVar +import asyncio +import warnings +from typing import Mapping, Optional, Sequence, TypeVar -import jax.random +import jax import numpy as np +from async_lru import alru_cache +from jax.random import PRNGKey from jaxtyping import PRNGKeyArray from haliax.util import StringHolderEnum -from levanter.data import ShardableDataset +from levanter.data import AsyncDataset +from levanter.utils.index import Index +from levanter.utils.thread_utils import future_from_value T = TypeVar("T") @@ -18,15 +24,19 @@ class StopStrategy(metaclass=StringHolderEnum): RESTART_STRATEGY = "restart" -class MixtureDataset(ShardableDataset[T]): +class MixtureDataset(AsyncDataset[T]): """ MixtureDataset supports loading data from multiple datasets. It takes a list of datasets and yields from them according to the weights. + Creating a random-access MixtureDataset is challenging because we need to keep track of the current index of each + dataset. So solve this, we instead use "block-deterministic" mixtures, where the number of samples from each dataset + in each block is always identical (and we shuffle the order of the dataset ids in each block). + Args: datasets: A dict of datasets, where the key is the name of the dataset and the value is the dataset itself weights: weights for each dataset - stop_strategy: strategy for stopping the iteration, by default RESTART_STRATEGY + stop_strategy: strategy for stopping the iteration, by default RESTART_STRATEGY. 
(Currently only RESTART_STRATEGY is supported) - FIRST_STOP_STRATEGY: stop when one dataset has been exhausted - ALL_STOP_STRATEGY: stop when all datasets have been exhausted - RESTART_STRATEGY: restart the dataset when it has been exhausted @@ -35,57 +45,187 @@ class MixtureDataset(ShardableDataset[T]): def __init__( self, - datasets: Mapping[str, ShardableDataset[T]], - weights: Dict[str, float], - key: int | PRNGKeyArray, + datasets: Mapping[str, AsyncDataset[T]], + weights: dict[str, float], + block_size: int, + *, + randomize_blocks: bool = True, + key: PRNGKeyArray | int, stop_strategy: str = StopStrategy.RESTART_STRATEGY, ): - self.datasets = datasets self.weights = MixtureDataset._normalize_weights(weights) + self.datasets = {name: dataset for name, dataset in datasets.items() if self.weights.get(name, 0) > 0} + self.dataset_index = Index(self.datasets.keys()) + self.block_size = block_size + # we pack index and ds id into a single 32 bit, so block size must be at most 2^16 + if block_size >= 2**16: + raise ValueError(f"Block size must be at most 2^16, got {block_size}") + + self.randomize_blocks = randomize_blocks + + if isinstance(key, int): + key = PRNGKey(key) + + self.key = key if stop_strategy not in StopStrategy: # type: ignore raise ValueError(f"Stop strategy {stop_strategy} is not supported.") - self.stop_strategy = stop_strategy + # for now, just support restart strategy + if stop_strategy != StopStrategy.RESTART_STRATEGY: + raise NotImplementedError("Only restart strategy is supported for now.") - if not isinstance(key, int): - key = jax.random.randint(key, (), 0, 2**20).item() + self.stop_strategy = stop_strategy - self.key = key + self._counts_per_block = self._compute_expected_counts_per_block(block_size) + # precompute a list of ids for each block + # the ids contain both the dataset index and the index within the dataset + self._unpermuted_ids = self._compute_unpermuted_ids(self._counts_per_block) + + def _compute_expected_counts_per_block(self, block_size): + _expected_values_per_block = np.zeros(len(self.datasets), dtype=np.int32) + for i, dsname in enumerate(self.dataset_index): + _expected_values_per_block[i] = self.weights[dsname] * block_size + + # handle remainder by adding to the largest dataset + largest_dataset = np.argmax(_expected_values_per_block) + _expected_values_per_block[largest_dataset] += block_size - _expected_values_per_block.sum() + + # check if any dataset has 0 samples (and nonzero weight) + for i, dsname in enumerate(self.dataset_index): + if _expected_values_per_block[i] == 0 and self.weights[dsname] > 0: + warnings.warn( + f"Dataset {dsname} has 0 samples in the block, but weight of {self.weights[dsname]}." + " Recommend increasing block size." 
+ ) + + return _expected_values_per_block + + def _compute_unpermuted_ids(self, counts_per_block): + unpermuted_ids = np.zeros(int(counts_per_block.sum()), dtype=np.int64) + start = 0 + for i, dsname in enumerate(self.dataset_index): + count = counts_per_block[i] + unpermuted_ids[start : start + count] = (i << 16) + np.arange(count) + start += count + return unpermuted_ids @staticmethod - def _normalize_weights(weights: Dict[str, float]): + def _normalize_weights(weights: dict[str, float]): """Normalize the weights to sum to 1""" total = sum(weights.values()) if total == 0: raise ValueError(f"Datasets' weights cannot sum to 0, got {weights}") return {name: weight / total for name, weight in weights.items() if weight > 0} - def shard(self, shard_id: int, num_shards: int) -> "MixtureDataset": - """Return a MixtureDataset with the sharded datasets""" - sharded = {name: dset.shard(shard_id, num_shards) for name, dset in self.datasets.items()} - my_key = int(jax.random.randint(jax.random.PRNGKey(self.key), (num_shards,), 0, 2**20)[shard_id]) - return MixtureDataset(datasets=sharded, weights=self.weights, stop_strategy=self.stop_strategy, key=my_key) - - def __iter__(self) -> Iterator[np.ndarray]: - iterators = {name: iter(dataset) for name, dataset in self.datasets.items()} - current_weights = self._normalize_weights(self.weights) - rng = np.random.default_rng(self.key) - - while True: - dataset_name = rng.choice(list(current_weights.keys()), p=list(current_weights.values())) - try: - item = next(iterators[dataset_name]) - yield item - except StopIteration: - match self.stop_strategy: - case StopStrategy.RESTART_STRATEGY: - iterators[dataset_name] = iter(self.datasets[dataset_name]) - case StopStrategy.FIRST_STOP_STRATEGY: - break - case StopStrategy.ALL_STOP_STRATEGY: - del iterators[dataset_name] - del current_weights[dataset_name] - if len(current_weights) == 0: - break - current_weights = self._normalize_weights(current_weights) + async def async_len(self) -> int: + if self.stop_strategy == StopStrategy.RESTART_STRATEGY: + raise ValueError("Length is infinite for restart strategy") + + raise NotImplementedError("Length is not implemented for other strategies") + + async def final_length_is_known(self) -> bool: + if self.stop_strategy == StopStrategy.RESTART_STRATEGY: + return False + + raise NotImplementedError("Length is not known for other strategies") + + def is_finite(self) -> bool: + if self.stop_strategy == StopStrategy.RESTART_STRATEGY: + return False + + return True + + async def current_len(self) -> Optional[int]: + if self.stop_strategy == StopStrategy.RESTART_STRATEGY: + return None + + raise NotImplementedError("Length is not known for other strategies") + + @alru_cache + async def _get_block(self, index: int) -> Optional[np.ndarray]: + if not self.randomize_blocks: + return self._unpermuted_ids + + return np.array(_compute_block_assignment(self._unpermuted_ids, index, self.key)) + + def _index_into_dataset_for_id(self, id: int, block_id) -> tuple[int, int]: + dataset_id = id >> 16 + dataset_index = id & 0xFFFF + return dataset_id, dataset_index + block_id * self._counts_per_block[dataset_id] + + async def get_batch(self, indices: Sequence[int]) -> Sequence[T]: + block_ids = np.array([idx // self.block_size for idx in indices]) + blocks = [self._get_block(block_id) for block_id in block_ids] + blocks = await asyncio.gather(*blocks) + + # split the indices into batches for each dataset + batches_per_dataset: list[list[int]] = [[] for _ in range(len(self.datasets))] + 
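A small numpy sketch of the block-deterministic bookkeeping implemented above, assuming toy weights and a tiny block size (none of these names are Levanter API). Each block draws a fixed number of examples per dataset, and each drawn id packs the dataset index into the high bits and the within-block offset into the low 16 bits.

import numpy as np

weights = {"dclm": 0.75, "wiki": 0.25}   # already normalized
block_size = 8

counts = np.array([int(w * block_size) for w in weights.values()])
counts[np.argmax(counts)] += block_size - counts.sum()   # remainder goes to the largest dataset
# counts == [6, 2]: every block holds exactly 6 "dclm" ids and 2 "wiki" ids

# pack (dataset_id, within-block offset) into one int, mirroring `(i << 16) + offset`
ids = np.concatenate([(ds_id << 16) + np.arange(n) for ds_id, n in enumerate(counts)])

def unpack(packed_id: int, block_id: int) -> tuple[int, int]:
    ds_id, offset = packed_id >> 16, packed_id & 0xFFFF
    # an id's index *within its dataset* advances by that dataset's per-block count each block
    return ds_id, offset + block_id * int(counts[ds_id])

print([unpack(int(i), block_id=2) for i in ids[:3]])   # -> [(0, 12), (0, 13), (0, 14)]

Shuffling the order of `ids` separately per block, with a key folded in per block as `_compute_block_assignment` does, keeps the mixture random while staying random-access.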
indices_in_final_batch: list[list[int]] = [[] for _ in range(len(self.datasets))] + + assert len(indices) == len(blocks) == len(block_ids) + + for batch_index, (idx, block, block_id) in enumerate(zip(indices, blocks, block_ids)): + index_within_block = idx % self.block_size # which element of the block to get + id = block[index_within_block] # for this block, which dataset+base dataset offset + dataset_id, dataset_index = self._index_into_dataset_for_id(id, block_id) + batches_per_dataset[dataset_id].append(dataset_index) + indices_in_final_batch[dataset_id].append(batch_index) + + # get the batches from each dataset + batch_futures = [] + for dataset_id, indices_for_dataset in enumerate(batches_per_dataset): + if len(indices_for_dataset) == 0: + batch_futures.append(future_from_value([])) + else: + dataset = self._dataset_of_id(dataset_id) + indices_for_dataset = await self._remap_indices(dataset, indices_for_dataset) + batch_futures.append(dataset.get_batch(indices_for_dataset)) + + batches = await asyncio.gather(*batch_futures) + + # reassemble the final batch + final_batch = [None] * len(indices) + + for dataset_id, indices_into_batch in enumerate(indices_in_final_batch): + for i, idx in enumerate(indices_into_batch): + assert final_batch[idx] is None + assert len(final_batch) > idx + final_batch[idx] = batches[dataset_id][i] + + return final_batch # type: ignore + + async def getitem_async(self, index: int) -> T: + # simpler implementation because there's only one + block_id = index // self.block_size + index = index % self.block_size + permuted_ids = await self._get_block(block_id) + dataset_id, dataset_index = self._index_into_dataset_for_id(permuted_ids[index], block_id) + + dataset = self._dataset_of_id(dataset_id) + dataset_index = (await self._remap_indices(dataset, [dataset_index]))[0] + + return await dataset.getitem_async(dataset_index) + + async def _remap_indices(self, ds, indices_into_ds): + """ + Handles wrap around for datasets that have finite length + """ + if self.stop_strategy == StopStrategy.RESTART_STRATEGY: + if ds.is_finite(): + max_elem = max(indices_into_ds) + length_of_dataset = await ds.wait_until_len_at_least(max_elem + 1) + indices_into_ds = [idx % length_of_dataset for idx in indices_into_ds] + + return indices_into_ds + + raise NotImplementedError("Length is not known for other strategies") + + def _dataset_of_id(self, id): + return self.datasets[self.dataset_index[id]] + + +def _compute_block_assignment(base_ids, index, key): + rng = jax.random.fold_in(key, index) + permuted_ids = jax.random.permutation(rng, base_ids) + return permuted_ids diff --git a/src/levanter/data/permutation.py b/src/levanter/data/permutation.py new file mode 100644 index 000000000..a0f0566f4 --- /dev/null +++ b/src/levanter/data/permutation.py @@ -0,0 +1,135 @@ +import dataclasses +from typing import Optional, Sequence + +import jax.random +from async_lru import alru_cache + +from levanter.data import AsyncDataset +from levanter.data._prp import Permutation +from levanter.data.dataset import T_co + + +class PermutationDataset(AsyncDataset[T_co]): + """A permutation dataset that wraps another dataset and applies a permutation to the indices.""" + + # TODO: add epoch reshuffling + + def __init__(self, dataset: AsyncDataset[T_co], key: jax.random.PRNGKey): + self.dataset = dataset + self.key = key + self._permutation: Optional[Permutation] = None + + async def async_len(self) -> int: + return await self.dataset.async_len() + + async def final_length_is_known(self) -> bool: + return 
await self.dataset.final_length_is_known() + + def is_finite(self) -> bool: + return self.dataset.is_finite() + + async def current_len(self) -> Optional[int]: + return await self.dataset.current_len() + + async def getitem_async(self, index: int) -> T_co: + permutation = await self._get_permutation() + return await self.dataset.getitem_async(permutation(index)) + + async def get_batch(self, indices: Sequence[int]) -> Sequence[T_co]: + permutation = await self._get_permutation() + return await self.dataset.get_batch([permutation(i) for i in indices]) + + async def _get_permutation(self): + if self._permutation is None: + self._permutation = Permutation(await self.dataset.async_len(), self.key) + return self._permutation + + +class EraShufflingDataset(AsyncDataset[T_co]): + """ + A dataset that shuffles the data in "eras" of fixed length. Era shuffling is somewhere in between a shuffle buffer + and a permutation. It's a "local" permutation where pi(i) \in [ (i//L) * L, (i//L + 1) * L ) for some era length L. + + The advantages of era shuffling are: + - It's stateless, so resumes are easy + - Like shuffle buffers, it's a decent compromise between full shuffling and no shuffling + - Like a shuffle buffer, it's streaming: we don't need to know the length of the data in advance + + The disadvantages are: + - It's not as good as full shuffling + - It distributes less well than a shuffle buffer does. It's more like a "local" shuffle buffer. + - You have to wait for an era to fill before you can start shuffling it. With prefetching, this is less of an issue. + + + # TODO: given the way tokenization works (where it runs way ahead of training), we can probably increase the era + length # over time. This would be a nice feature to have. + """ + + def __init__(self, dataset: AsyncDataset[T_co], era_length: int, *, key: jax.random.PRNGKey): + self.dataset = dataset + self.era_length = era_length + self.key = key + + @alru_cache(maxsize=4) # we're mostly going to be going sequentially + async def gen_era_permutation(era: int) -> Permutation: + # TODO: support epochs + # edge case: final era may be shorter than era_length + current_len = await self.dataset.wait_until_len_at_least((era + 1) * self.era_length) + era_length = min(self.era_length, current_len - era * self.era_length) + + mix_key = jax.random.fold_in(key, era) + return Permutation(era_length, mix_key) + + self.gen_era_permutation = gen_era_permutation + + async def _get_index(self, idx: int) -> int: + if idx < 0: + raise ValueError("Negative indices are not supported") + era = idx // self.era_length + permutation = await self.gen_era_permutation(era) + return permutation(idx - era * self.era_length) + era * self.era_length + + async def async_len(self) -> int: + return await self.dataset.async_len() + + async def final_length_is_known(self) -> bool: + return await self.dataset.final_length_is_known() + + def is_finite(self) -> bool: + return self.dataset.is_finite() + + async def current_len(self) -> Optional[int]: + # nb this is the no-wait length, which means we might be a bit behind the length of the inner dataset + inner_current_len = await self.dataset.current_len() + if inner_current_len is None: + return None + + # if we have the final length, and it's the inner_current_len, then we can return the final length + if await self.final_length_is_known() and inner_current_len == await self.async_len(): + return inner_current_len + + # otherwise, we need to wait for the era to fill + era = inner_current_len // self.era_length + return era * 
self.era_length + + async def getitem_async(self, index: int) -> T_co: + return await self.dataset.getitem_async(await self._get_index(index)) + + async def get_batch(self, indices: Sequence[int]) -> Sequence[T_co]: + return await self.dataset.get_batch([await self._get_index(i) for i in indices]) + + def __repr__(self): + return f"EraShufflingDataset({repr(self.dataset)}, era_length={self.era_length})" + + def __str__(self): + return f"EraShufflingDataset({str(self.dataset)})" + + async def wait_until_len_at_least(self, length: int) -> int: + # wait until we hit the next era + next_era_end = (length // self.era_length + 1) * self.era_length + return await self.dataset.wait_until_len_at_least(next_era_end) + + +@dataclasses.dataclass +class EraConfig: + era_length: int diff --git a/src/levanter/data/shard_cache.py b/src/levanter/data/shard_cache.py deleted file mode 100644 index 8956412b5..000000000 --- a/src/levanter/data/shard_cache.py +++ /dev/null @@ -1,1521 +0,0 @@ -# Dataset for preprocessing data, tokenizing, and caching to disk. -import asyncio -import dataclasses -import heapq -import logging as pylogging -import os -import threading -import time -from contextlib import AbstractContextManager -from dataclasses import dataclass -from typing import IO, Any, Callable, Dict, Iterable, Iterator, List, Optional, Sequence, TypeVar - -import fsspec.core -import pyarrow as pa -import pyarrow.parquet as pq -import ray -from dataclasses_json import dataclass_json -from fsspec import AbstractFileSystem -from ray.actor import ActorHandle -from ray.exceptions import GetTimeoutError - -from ..utils.ray_utils import ( - ExceptionInfo, - RefBox, - SnitchRecipient, - current_actor_handle, - log_failures_to, - ser_exc_info, -) -from ._preprocessor import BatchProcessor, BatchResult, as_record_batch, dict_from_record_batch -from ._queue import ( - PriorityProcessorActor, - PriorityWorkItem, - PriorityWorkTaskGroup, - PriorityWorkTaskGroupSpec, - _BatchProcessorQueue, -) -from .dataset import ShardableDataset -from .metrics_monitor import InProgressCacheMetrics, LoggerMetricsMonitor, MetricsMonitor -from .sharded_dataset import ShardedDataset - - -G = TypeVar("G") -T = TypeVar("T") -T_co = TypeVar("T_co", covariant=True) - - -logger = pylogging.getLogger(__name__) - -DEFAULT_ROWS_PER_CHUNK = 8192 -DEFAULT_MAX_BYTES_PER_BATCH = 256 * 1024 * 1024 # 256 MB, this is pre-preprocessing python object size -DEFAULT_MAX_SHARDS_TO_READ_AT_ONCE = 32 -LEDGER_FILE_NAME = "cache_ledger.json" - -LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -LEVEL_TO_LOG = pylogging.INFO - - -def build_or_load_cache( - cache_dir: str, - input_shards: ShardedDataset[T], - processor: BatchProcessor[T], - batch_size: int = 1, - rows_per_chunk: int = DEFAULT_ROWS_PER_CHUNK, - await_finished: bool = True, - monitors: Optional[Sequence["MetricsMonitor"]] = None, - cache_config: Optional[Dict[str, Any]] = None, -) -> "ShardCache": - """ - Produces a sharded cache of the dataset using Ray for distributed processing. The cache can be any path - on any file system understood by fsspec. - - This system is designed with tokenization and similar processes in mind, but it can potentially be used for any kind - of preprocessing that converts input batches to output batches. The main design goal is to make it easy to - parallelize preprocessing across multiple machines while maintaining reproducibility and fault tolerance. 
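The era-shuffling contract described above, where pi(i) stays inside [(i//L)*L, (i//L+1)*L), can be sketched with a plain numpy permutation per era. Levanter's real implementation uses a PRNG-keyed Permutation and handles a short final era; this toy version ignores that edge case.

import numpy as np

def era_shuffled_index(i: int, era_length: int, seed: int) -> int:
    era = i // era_length
    rng = np.random.default_rng([seed, era])    # one deterministic permutation per era
    perm = rng.permutation(era_length)          # local shuffle within this era's window
    return era * era_length + int(perm[i % era_length])

L = 4
print([era_shuffled_index(i, L, seed=0) for i in range(2 * L)])
# every output index stays inside its own era window: [0, 4) for the first era, [4, 8) for the second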
- Usually the machines in question are the ones doing the training, but they could be separate machines as well. - - See the [Dataloader Design Doc](https://github.com/stanford-crfm/levanter/blob/main/docs/design/Data-Loader-Design.md) - for a somewhat out of date overview of the design. - - Args: - cache_dir: The directory to write the cache to. This can be any path understood by fsspec. - input_shards: A ShardedDataset that will be used to read the input data. Conceptually, it's just a mapping - from shard names to iterators over the data in that shard. - processor: A BatchProcessor that will be used to process batches of data. This is the main place where - you can customize the preprocessing pipeline. - batch_size: When reading from the cache, how many examples to read at a time. - rows_per_chunk: The number of rows to write to each chunk. May be smaller at the end of a shard. - await_finished: If True, this function will block until the cache is finished. If False, it will return - immediately. - monitors: a list of MetricsMonitors to attach to the cache. These will be called periodically with - metrics about the cache build process. If None, will add a LoggerMetricsMonitor. - - Returns: - (ShardCache) A ShardCache object that can be used to read the cache. - - """ - # first see if we need to do anything - cache = ShardCache.build_or_load( - cache_dir=cache_dir, - shard_source=input_shards, - processor=processor, - batch_size=batch_size, - rows_per_chunk=rows_per_chunk, - cache_config=cache_config, - ) - - if cache.is_finished: - logger.info("Cache already finished. Skipping.") - return cache - - if monitors is None: - monitors = [LoggerMetricsMonitor()] - - for monitor in monitors: - cache.attach_metrics_monitor(monitor) - - while await_finished: - try: - cache.await_finished(4.0) - break - except TimeoutError: - pass - - return cache - - -@dataclass_json -@dataclass -class ChunkMetadata: - name: str - num_rows: int - field_counts: Dict[str, int] - - -@dataclass_json -@dataclass -class ShardMetadata: - chunks: List[ChunkMetadata] = dataclasses.field(default_factory=list) - is_finished: bool = False - - @property - def total_rows(self): - return sum(chunk.num_rows for chunk in self.chunks) - - @property - def total_chunks_produced(self): - return len(self.chunks) - - -@dataclass_json -@dataclass -class CacheLedger: - """Written at the end of the cache build process. Contains the global chunk order.""" - - chunks: List[ChunkMetadata] = dataclasses.field(default_factory=list) - metadata: Dict[str, Any] = dataclasses.field(default_factory=dict) - - -class SerialCacheWriter(AbstractContextManager): - """ - Writes ShardCache-compatible caches to disk. This is a serial version of ShardCacheWriter that doesn't use Ray. - Mostly for scripts and debugging. - - Examples: - >>> with SerialCacheWriter(cache_dir, rows_per_chunk=1024) as writer: - ... for batch in process_batches(): - ... 
writer.write_batch(batch) - """ - - def __init__( - self, - cache_dir: str, - rows_per_chunk: int = DEFAULT_ROWS_PER_CHUNK, - cache_config: Optional[Dict[str, Any]] = None, - ): - if rows_per_chunk <= 0: - raise ValueError("rows_per_chunk must be positive") - self.cache_dir = cache_dir - self.cache_config = cache_config - self._rows_per_chunk = rows_per_chunk - self._chunks: List[ChunkMetadata] = [] - self._current_chunk_writer: Optional[_ChunkWriter] = None - self._is_closed = False - - def __enter__(self) -> "SerialCacheWriter": - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - # if successful, write the ledger - if self._current_chunk_writer is not None: - self._current_chunk_writer.__exit__(exc_type, exc_val, exc_tb) - self._chunks.append(self._current_chunk_writer.get_metadata()) - self._current_chunk_writer = None - - if exc_type is None: - _serialize_json_and_commit( - os.path.join(self.cache_dir, LEDGER_FILE_NAME), CacheLedger(self._chunks, self.cache_config) - ) - logger.info(f"Cache ledger written to {self.cache_dir}") - self._is_closed = True - - def result(self, batch_size: int = 1) -> "ShardCache": - if not self._is_closed: - raise RuntimeError("Cannot get result until ShardCacheWriter is closed") - return ShardCache.load(self.cache_dir, batch_size=batch_size) - - def write_batch(self, batch: BatchResult): - rb = as_record_batch(batch) - - while rb.num_rows > 0: - if self._current_chunk_writer is None: - self._current_chunk_writer = _ChunkWriter( - self.cache_dir, f"chunk-{len(self._chunks)}", rb.schema - ).__enter__() - - slice = rb.slice(0, min(rb.num_rows, self._rows_per_chunk - self._current_chunk_writer.num_rows)) - self._current_chunk_writer.write_batch(slice) - rb = rb.slice(slice.num_rows) - - if self._current_chunk_writer.num_rows >= self._rows_per_chunk: - self._current_chunk_writer.__exit__(None, None, None) - self._chunks.append(self._current_chunk_writer.get_metadata()) - self._current_chunk_writer = None - - -class _ChunkWriter: - def __init__(self, cache_dir: str, chunk_name: str, schema: pa.Schema): - self.cache_dir = cache_dir - self.chunk_name = chunk_name - self.schema = schema - self.file: Optional[IO] = None - self.writer: Optional[pq.ParquetWriter] = None - self.num_rows = 0 - self.field_counts: Dict[str, int] = {} - - self.is_finished = False - - def __enter__(self): - self.file = fsspec.open(os.path.join(self.cache_dir, f"{self.chunk_name}.parquet"), "wb").__enter__() - self.writer = pq.ParquetWriter(self.file, self.schema, version="2.6", compression="ZSTD").__enter__() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.writer is not None: - self.writer.__exit__(exc_type, exc_val, exc_tb) - if self.file is not None: - self.file.__exit__(exc_type, exc_val, exc_tb) - - self.is_finished = True - - def write_batch(self, batch: pa.RecordBatch): - assert not self.is_finished - assert self.writer is not None - self.writer.write_batch(batch) - self.num_rows += batch.num_rows - - for i in range(batch.num_columns): - name = batch.field(i).name - value = batch.column(i) - if isinstance(value, pa.ListArray): - value = value.flatten() - self.field_counts[name] = self.field_counts.get(name, 0) + len(value) - elif isinstance(value, pa.ChunkedArray): - self.field_counts[name] = self.field_counts.get(name, 0) + value.length() - - def get_metadata(self) -> ChunkMetadata: - if not self.is_finished: - raise RuntimeError("Cannot get metadata for unfinished chunk") - return ChunkMetadata(self.chunk_name, self.num_rows, 
self.field_counts) - - -class _ShardMetadataWriter: - def __init__(self, metadata_path): - self.metadata_path = metadata_path - try: - with fsspec.open(self.metadata_path, "r") as file: - self.metadata = ShardMetadata.from_json(file.read()) # type: ignore - except FileNotFoundError: - self.metadata = ShardMetadata() - - @property - def is_finished(self): - return self.metadata.is_finished - - @property - def chunks(self): - return self.metadata.chunks - - @property - def num_chunks(self): - return len(self.metadata.chunks) - - def commit_chunk(self, chunk: ChunkMetadata): - assert not self.metadata.is_finished - self.metadata.chunks.append(chunk) - self._commit() - - def finish(self): - self.metadata.is_finished = True - self._commit() - - def _commit(self): - _serialize_json_and_commit(self.metadata_path, self.metadata) - - -# thinking through the design of the cache system - -# we decided to use Ray, which was maybe a mistake, but here we are. -# Ray doesn't like it when the number of actors gets too large, so we can't have one actor per shard. -# we have N nodes and K shards. We want to produce chunks of size C examples, from each shards. -# We define a global order over chunks [shard[0].chunk[0], shard[1].chunk[0], ... shard[K].chunk[0], shard[0].chunk[1], ...] -# with the obvious extension for if one shard has more chunks than another. -# We want to produce chunks in roughly this order, but we want to do it in parallel. -# We also want to be able to recover from failures, and we want to be able to resume a cache build. - -# at a high level, we have 3 steps: -# 1. read batches from the source -# 2. process batches, concatenating them into chunks -# 3. write chunks to disk - -# The difficulty is that we want parallelism and we want to control the order of chunks. -# reading batches requires CPU and network. This means we should limit the number to roughly the number of nodes, maybe times 2. -# We want to prioritize so that we read 1 chunks worth of batches from each shard before reading more from another shard. -# We also want to prioritize reading earlier shards before later shards (within a chunk generation round). -# Ray also seems to get upset about having too many processes, and we can't serialize the iterators over shards. 
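The global chunk order sketched in the comments above (every shard's chunk 0, then every shard's chunk 1, and so on) is what the priority function `chunk_idx * num_shards + shard_idx` produces. A tiny standalone illustration, with K a made-up shard count:

K = 3  # shards
order = sorted((chunk * K + shard, shard, chunk) for shard in range(K) for chunk in range(2))
print([(shard, chunk) for _, shard, chunk in order])
# [(0, 0), (1, 0), (2, 0), (0, 1), (1, 1), (2, 1)]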
- - -def _shard_reader_generator(shard_source: ShardedDataset[T], shard_name: str, start_row: int, batch_size: int): - shard_iter = shard_source.open_shard_at_row(shard_name, start_row) - batch = [] - for row in shard_iter: - batch.append(row) - - if len(batch) == batch_size: - yield batch - batch = [] - - if len(batch) > 0: - yield batch - - -@dataclass -class ShardGroupToBeProcessed(PriorityWorkTaskGroupSpec): - name: str - builder_ref: ray.actor.ActorHandle # _ChunkCacheBuilder - writer: ray.actor.ActorHandle # _GroupedShardWriter - shard_source: ShardedDataset - shard_names: Sequence[str] - priority_fn: Callable[[int, int], float] - processor_actor: ray.actor.ActorHandle # BatchProcessorQueue - batch_size: int - num_rows_per_chunk: int - group_id: int - - def build(self) -> "PriorityWorkTaskGroup": - return ShardGroupTaskGroup(self) - - -class ShardGroupTaskGroup(PriorityWorkTaskGroup): - def __init__(self, spec: ShardGroupToBeProcessed): - self.spec = spec - self.logger = pylogging.getLogger(f"shard_reader.{spec.group_id}.{spec.name}") - - try: - metadata: dict[str, ShardMetadata] = _initial_shard_metadatas( - self.spec.shard_source, self.spec.shard_names, self.spec.writer - ) - except Exception as e: - self.spec.builder_ref.other_failed.remote(ser_exc_info()) - raise e - - batch_size = min(self.spec.batch_size, self.spec.num_rows_per_chunk) - - self._items: list[PriorityWorkItem] = [] - - for shard_name in self.spec.shard_names: - shard_idx = self.spec.shard_source.shard_names.index(shard_name) - try: - shard_metadata = metadata[shard_name] - reader = _shard_reader_generator( - self.spec.shard_source, shard_name, shard_metadata.total_rows, batch_size - ) - - if shard_metadata.is_finished: - self.logger.info(f"Shard {shard_name} already finished. Skipping.") - - task_name = f"shard_reader.{self.spec.name}.{shard_name}" - - chunk_idx = len(shard_metadata.chunks) - item = ShardReaderItem(self, task_name, shard_name, shard_idx, chunk_idx, reader) - - heapq.heappush(self._items, item) - except Exception as e: - self.logger.exception(f"Error while initializing shard {shard_name}") - self.spec.writer[shard_name].shard_failed.remote(ser_exc_info()) - raise e - - @property - def name(self): - return self.spec.name - - def items(self) -> Sequence["PriorityWorkItem"]: - return self._items - - -# NB This class is stateful -@dataclass -class ShardReaderItem(PriorityWorkItem): - """ - Each time execute is called, this class reads one chunk's worth of batches from the shard - and dispatches them to the processor. 
- """ - - group: ShardGroupTaskGroup - name: str - shard_name: str - shard_idx: int - chunk_idx: int - reader: Iterator[list] - - @property - def priority(self): - return self.group.spec.priority_fn(self.shard_idx, self.chunk_idx) - - @property - def spec(self): - return self.group.spec - - def execute(self) -> tuple[bool, Optional[ray.ObjectRef]]: - exhausted_shard = False - writer = self.spec.writer - - chunk_batch_idx = 0 # the index of the batch within the chunk - chunk_filled = False # whether or not we've filled the chunk to max size - total_chunk_rows = 0 # the total number of rows in the chunk - batch_result_ref = None - - self.group.logger.debug(f"Reading one chunk of shard {self.shard_name}: {self.chunk_idx}") - - try: - while not chunk_filled: - batch = next(self.reader, None) - if batch is None: - exhausted_shard = True - break - - exhausted_shard = len(batch) < self.spec.batch_size - total_chunk_rows += len(batch) - - if batch: - priority = self.spec.priority_fn(self.shard_idx, self.chunk_idx) - # these times aren't exact because the times might be from different machines - # but they're just for logging - time_in = time.time() - batch_result_ref = ray.get( - self.spec.processor_actor.submit.remote( - priority=priority, - desc=f"{self.shard_name}.{self.chunk_idx}.{chunk_batch_idx}", - batch=RefBox(ray.put(batch)), - ) - ) - writer.chunk_batch_finished.remote( - self.shard_name, self.chunk_idx, chunk_batch_idx, RefBox(batch_result_ref), time_in - ) - chunk_batch_idx += 1 - del batch - - if total_chunk_rows >= self.spec.num_rows_per_chunk or exhausted_shard: - chunk_filled = True - - if chunk_batch_idx > 0: - writer.chunk_finished_reading.remote(self.shard_name, self.chunk_idx, chunk_batch_idx) - old_prio = self.priority - self.chunk_idx += 1 - assert self.priority > old_prio - - if exhausted_shard: - writer.shard_finished_reading.remote(self.shard_name, self.chunk_idx) - - self.group.logger.debug( - f"Finished reading one chunk of shard {self.shard_name}: {self.chunk_idx} {exhausted_shard}" - ) - - return exhausted_shard, batch_result_ref - except Exception as e: # noqa - self.group.logger.exception(f"Error while processing shard {self.shard_name}") - # fire and forget - writer.shard_failed.remote(self.shard_name, ser_exc_info()) - raise e - - -def _initial_shard_metadatas(shard_source, shard_names, shard_group_writer): - shard_metadatas: dict[str, ShardMetadata] = {} - _metadata_futures = [shard_group_writer.current_metadata.remote(name) for name in shard_names] - shard_metadatas_rs = ray.get(_metadata_futures) - for shard_name, shard_metadata in zip(shard_names, shard_metadatas_rs): - shard_metadatas[shard_name] = shard_metadata - return shard_metadatas - - -def _serialize_json_and_commit(path, obj): - # just to be paranoid, we write to a temp file and then rename it - # TODO: probably we could do better here - with fsspec.open(f"{path}.tmp", "w") as file: - file.write(obj.to_json()) - # now copy the old file to a backup - fs: AbstractFileSystem = fsspec.core.url_to_fs(path)[0] - fs.mkdirs(os.path.dirname(path), exist_ok=True) - if fs.exists(path): - fs.copy(path, f"{path}.bak") - fs.rename(f"{path}.tmp", path) - - -def _load_cache_ledger(cache_dir) -> CacheLedger: - try: - ledger_path = os.path.join(cache_dir, LEDGER_FILE_NAME) - logger.debug(f"Attempting to load cache ledger from {ledger_path}") - with fsspec.open(ledger_path) as file: - cache_ledger = CacheLedger.from_json(file.read()) # type: ignore - return cache_ledger - except FileNotFoundError: - raise 
FileNotFoundError(f"Cache ledger not found at {ledger_path}") - - -@dataclass -class _ShardStatus: - num_chunks_sent: int = 0 - current_buffer: list[ChunkMetadata] = dataclasses.field(default_factory=list) - expected_num_chunks: Optional[int] = None - - def pop_chunk_to_send(self) -> Optional[ChunkMetadata]: - if len(self.current_buffer) == 0: - return None - else: - self.num_chunks_sent += 1 - return self.current_buffer.pop(0) - - @property - def is_finished_and_buffer_empty(self): - return self.expected_num_chunks is not None and self.num_chunks_sent >= self.expected_num_chunks - - -# Ray does poorly with large numbers of actors (grumble grumble), so we can't have one actor per shard. -# This class wraps a map of shard names to _ShardWriterWorkers, and manages the lifecycle of the workers. -@ray.remote(num_cpus=0.0, scheduling_strategy="SPREAD") # type: ignore -class _GroupShardWriterWorker: - def __init__(self, parent_ref, cache_dir: str, shard_names: Sequence[str]): - with log_failures_to(parent_ref): - pylogging.basicConfig(level=LEVEL_TO_LOG, format=LOG_FORMAT) - self.cache_dir = cache_dir - self.shard_names = shard_names - self.shard_writers: dict[str, _ShardWriterWorker] = { - shard_name: _ShardWriterWorker(parent_ref, cache_dir, shard_name) for shard_name in shard_names - } - - def current_metadata(self, shard_name: str): - return self.shard_writers[shard_name].current_metadata() - - async def chunk_batch_finished(self, shard_name: str, chunk_id: int, batch_idx: int, batch: RefBox, time_in): - # batch is a pa.RecordBatch ref box - try: - time_mid = time.time() - logger.debug( - f"Received in progress batch {batch_idx} of chunk {chunk_id} of shard {shard_name} in" - f" {time_mid - time_in}" - ) - # do a backoff loop until the batch is actually processed. log if it's been a while - timeout_interval = 20 - total_time_waited = 0 - - while True: - try: - # batch = await asyncio.wait_for(asyncio.shield(batch.ref), timeout_interval) - batch = await batch.ref - break - except asyncio.TimeoutError: - # to keep to round numbers, we log how much we asked for rather than how much we got - total_time_waited += timeout_interval - timeout_interval = min(2 * timeout_interval, 100) - logger.info( - f"Waiting for {shard_name}.{chunk_id}.{batch_idx} to be processed. " - f"Waited {total_time_waited} seconds." - ) - - if logger.isEnabledFor(pylogging.DEBUG): - logger.debug( - f"Received finished {shard_name}.{chunk_id}.{batch_idx} in {(time.time() - time_in):.2f} seconds." - ) - elif total_time_waited > 40: - logger.info( - f"Waited {total_time_waited} seconds for {shard_name}.{chunk_id}.{batch_idx} to be processed." 
- ) - return self.shard_writers[shard_name].chunk_batch_finished(chunk_id, batch_idx, batch) - except Exception as e: - print(f"Error while processing batch {batch_idx} of chunk {chunk_id} of shard {shard_name}", flush=True) - self.shard_writers[shard_name].chunk_failed(chunk_id, ser_exc_info()) - raise e - - def chunk_finished_reading(self, shard_name: str, chunk_id: int, expected_num_batches: int): - return self.shard_writers[shard_name].chunk_finished_reading(chunk_id, expected_num_batches) - - def chunk_failed(self, shard_name: str, chunk_id: int, error: ExceptionInfo): - return self.shard_writers[shard_name].chunk_failed(chunk_id, error) - - def shard_finished_reading(self, shard_name: str, expected_num_chunks: int): - return self.shard_writers[shard_name].shard_finished_reading(expected_num_chunks) - - def shard_failed(self, shard_name: str, error: ExceptionInfo): - return self.shard_writers[shard_name].shard_failed(error) - - -class _ShardWriterWorker: # type: ignore - """ - Actor that writes chunks to disk and updates the ShardMetadata. It reports to the ChunkCacheBroker - """ - - def __init__( - self, - parent_ref: ActorHandle, # ChunkCacheBuilder - cache_dir: str, - shard_name: str, - ): - pylogging.basicConfig(level=LEVEL_TO_LOG, format=LOG_FORMAT) - self.parent_ref = parent_ref - self.cache_dir = cache_dir - self.shard_name = shard_name - self.uncommited_chunks: list[tuple[int, ChunkMetadata]] = [] # heapq of (chunk index, chunk) - - self.metadata_writer = _ShardMetadataWriter(os.path.join(cache_dir, f"{shard_name}.json")) - self._expected_num_chunks: Optional[int] = None - - if self.metadata_writer.num_chunks > 0: - self.parent_ref.new_chunk.remote(shard_name, *self.metadata_writer.chunks) - - if self.metadata_writer.is_finished: - self._expected_num_chunks = self.metadata_writer.num_chunks - self.parent_ref.shard_finished.remote(self.shard_name, self._expected_num_chunks) - self.finished = True - else: - self.finished = False - - self.collator = _ChunkCollator(cache_dir, shard_name) - - def current_metadata(self): - return self.metadata_writer.metadata - - # forward some methods to the collator, handle any metadata that comes back - def chunk_batch_finished(self, chunk_id: int, batch_idx: int, batch: pa.RecordBatch): - metadata = self.collator.new_batch(chunk_id, batch_idx, batch) - if metadata is not None: - self._finished_chunk(chunk_id, metadata) - - return metadata - - def chunk_finished_reading(self, chunk_id: int, expected_num_batches: int): - metadata = self.collator.chunk_finished_reading(chunk_id, expected_num_batches) - if metadata is not None: - self._finished_chunk(chunk_id, metadata) - - return metadata - - def chunk_failed(self, chunk_id: int, error: ExceptionInfo): - self.collator.chunk_failed(chunk_id, error) - print(f"Error while processing chunk {chunk_id} of shard {self.shard_name}", flush=True) - self.parent_ref.shard_failed.remote(self.shard_name, error) - - def _finished_chunk(self, idx: int, chunk: ChunkMetadata): - if (idx < self.metadata_writer.num_chunks) or ( - self._expected_num_chunks is not None and idx >= self._expected_num_chunks - ): - logger.error(f"Received chunk {idx} for {self.shard_name} but it's already finished") - error = RuntimeError(f"Received chunk {idx} for {self.shard_name} but it's already finished") - self.parent_ref.shard_failed.remote(self.shard_name, ser_exc_info(error)) - raise error - - heapq.heappush(self.uncommited_chunks, (idx, chunk)) - self._attempt_to_commit_chunks() - - def shard_finished_reading(self, 
expected_num_chunks: int): - # TODO: add metadata that we're done reading to metrics - self._expected_num_chunks = expected_num_chunks - self._attempt_to_commit_chunks() - - def shard_failed(self, error: ExceptionInfo): - self.parent_ref.shard_failed.remote(self.shard_name, error) - - def _attempt_to_commit_chunks(self): - chunks_committed = [] - while len(self.uncommited_chunks) > 0 and self.uncommited_chunks[0][0] == self.metadata_writer.num_chunks: - _, chunk = heapq.heappop(self.uncommited_chunks) - chunk_number = self.metadata_writer.num_chunks - logger.debug(f"Committing chunk {chunk.name} of shard {self.shard_name}. It is chunk {chunk_number}") - self.metadata_writer.commit_chunk(chunk) - chunks_committed.append(chunk) - - if len(chunks_committed) > 0: - if self.finished: - raise RuntimeError("Tried to commit chunks after shard finished") - # TODO: this is called inside an async call so we need to not block, but we do need to sequence - # this to come before the shard_finished - self.parent_ref.new_chunk.remote(self.shard_name, *chunks_committed) - - if not self.finished and self.metadata_writer.num_chunks == self._expected_num_chunks: - self.metadata_writer.finish() - self.finished = True - self.parent_ref.shard_finished.remote(self.shard_name, self._expected_num_chunks) - - -class _ChunkCollator: - """ - This class is responsible for taking batches from the processor and writing them to disk in order. - It also handles the logic of when to commit chunks to disk. - - For each chunk (that is has data for and hasn't finished), it keeps a heapq of batches that have been - processed but not yet written to disk. When a new batch comes in, it checks if it's the next batch in the - chunk. If so, it writes it to disk and flushes any other batches that are ready to be written. - - A chunk isn't finished until it's received all the batches it's expecting and it knows how many batches - to expect. - - """ - - def __init__(self, cache_dir: str, shard_name: str): - self.cache_dir = cache_dir - self.shard_name = shard_name - self.chunk_writers: dict[int, _ChunkWriter] = {} # chunk index -> writer - self.batch_counts: dict[int, int] = {} # chunk index -> number of batches written - self.expected_totals: dict[int, int] = {} # chunk index -> expected num batches. 
- self.failed_chunks: dict[int, ExceptionInfo] = {} # chunk index -> error - self.chunk_partial_batches: dict[ - int, list[tuple[int, pa.RecordBatch]] - ] = {} # chunk index -> heapq of (batch index, batch) - - def new_batch(self, chunk_id, batch_idx, batch) -> Optional[ChunkMetadata]: - if chunk_id not in self.chunk_partial_batches: - self.chunk_partial_batches[chunk_id] = [] - self.batch_counts[chunk_id] = 0 - - heapq.heappush(self.chunk_partial_batches[chunk_id], (batch_idx, batch)) - - return self._attempt_to_write_chunk_fragments(chunk_id) - - def chunk_finished_reading(self, chunk_id, expected_num_batches) -> Optional[ChunkMetadata]: - self.expected_totals[chunk_id] = expected_num_batches - return self._attempt_to_write_chunk_fragments(chunk_id) - - def chunk_failed(self, chunk_id, error: ExceptionInfo): - self.failed_chunks[chunk_id] = error - if chunk_id in self.chunk_writers: - self.chunk_writers[chunk_id].__exit__(*error.restore()) - del self.chunk_writers[chunk_id] - - def _attempt_to_write_chunk_fragments(self, chunk_id) -> Optional[ChunkMetadata]: - if chunk_id in self.failed_chunks: - logger.error(f"Chunk {chunk_id} of shard {self.shard_name} already failed, not writing more") - raise self.failed_chunks[chunk_id].restore() - - if chunk_id in self.chunk_partial_batches: - chunk_batches = self.chunk_partial_batches[chunk_id] - - while len(chunk_batches) > 0: - batch_id, batch = chunk_batches[0] - if batch_id != self.batch_counts[chunk_id]: - break - - # we can write this batch - batch_id, batch = heapq.heappop(chunk_batches) - - if chunk_id not in self.chunk_writers: - assert batch_id == 0, f"Expected batch 0 but got {batch_id}" - chunk_name = os.path.join(self.shard_name, f"chunk-{chunk_id}") - writer = _ChunkWriter(self.cache_dir, chunk_name, batch.schema) - writer.__enter__() - self.chunk_writers[chunk_id] = writer - - self.chunk_writers[chunk_id].write_batch(batch) - self.batch_counts[chunk_id] += 1 - - if chunk_id not in self.batch_counts: - return None - - if chunk_id in self.expected_totals and self.batch_counts[chunk_id] == self.expected_totals[chunk_id]: - assert len(chunk_batches) == 0 - # we're done with this chunk - writer = self.chunk_writers[chunk_id] - writer.__exit__(None, None, None) - del self.chunk_writers[chunk_id] - del self.batch_counts[chunk_id] - del self.chunk_partial_batches[chunk_id] - return writer.get_metadata() - else: - return None - - -@ray.remote(num_cpus=0.5) # keep this small b/c it doesn't do a lot -class ChunkCacheBuilder(SnitchRecipient): - """ - Actor that manages the in-progress global ordering on chunks. ChunkCacheWriter's job is to hold the list of all - chunks as well as chunks from each shard while caching is running. - - This is a separate actor from the ChunkCacheBroker because - we need something that gets messages from shards in-order, and async methods make actors - lose that property. 
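The in-order commit logic that _ChunkCollator and _attempt_to_commit_chunks implement above boils down to one pattern: buffer out-of-order results in a heap and flush only the contiguous prefix. A minimal sketch, where the string payloads and the print are placeholders for the real Parquet writes:

import heapq

pending: list[tuple[int, str]] = []
next_expected = 0

def arrived(idx: int, payload: str) -> None:
    """Buffer a result and flush everything that is now contiguous from the front."""
    global next_expected
    heapq.heappush(pending, (idx, payload))
    while pending and pending[0][0] == next_expected:
        _, ready = heapq.heappop(pending)
        print("writing", ready)        # stand-in for writer.write_batch(...)
        next_expected += 1

for idx, name in [(2, "batch-2"), (0, "batch-0"), (1, "batch-1")]:
    arrived(idx, name)                 # batches are written 0, 1, 2 even though 2 arrived first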
- """ - - def __init__( - self, - broker_ref, - cache_dir: str, - name: str, - source: ShardedDataset[T], - processor: BatchProcessor[T], - rows_per_chunk: int, - ): - with log_failures_to(broker_ref): - pylogging.basicConfig(level=pylogging.INFO, format=LOG_FORMAT) - self.logger = pylogging.getLogger(f"{__name__}.{name}") - self.broker_ref = broker_ref - self.shard_status: Dict[str, _ShardStatus] = dict() - self._current_round_robin = [] - self.source = source - self._metrics = InProgressCacheMetrics() - - self_ref = current_actor_handle() - - if len(source.shard_names) == 0: - self.logger.warning("No shards to index?!?") - self._finish() - else: - self.logger.info(f"Starting cache build for {len(source.shard_names)} shards") - - self._shard_writers = [] - self._shard_readers = [] - self._processor_actors = [] - - for shard_name in source.shard_names: - self._current_round_robin.append(shard_name) - self.shard_status[shard_name] = _ShardStatus() - - num_shards = len(source.shard_names) - num_worker_groups = len(ray.nodes()) - num_shard_groups = max(min(num_worker_groups, num_shards), 1) - - # if we have a bunch of caches to build with one shard, we don't want them all - # assigned to the same node, so we use an offset based on the hash of the name (for stability) - # in an attempt to spread them out - group_offset = int(hash(name) % num_worker_groups) - - shard_groups: list[list[str]] = [[] for _ in range(num_shard_groups)] - for i, shard_name in enumerate(source.shard_names): - shard_groups[i % num_shard_groups].append(shard_name) - - def priority_fn(shard_idx, chunk_idx): - return chunk_idx * num_shards + shard_idx - - for group_id, shard_group in enumerate(shard_groups): - writer = _GroupShardWriterWorker.remote(self_ref, cache_dir, shard_group) # type: ignore - self._shard_writers.append(writer) - - # TODO: would probably be better if we didn't create one of these per shard group - processor_actor = _BatchProcessorQueue.remote(processor) # type: ignore - self._processor_actors.append(processor_actor) - - work_item = ShardGroupToBeProcessed( - name=name, - builder_ref=self_ref, - writer=writer, - shard_source=source, - shard_names=shard_group, - priority_fn=priority_fn, - processor_actor=processor_actor, - batch_size=processor.batch_size, - num_rows_per_chunk=rows_per_chunk, - group_id=group_id, - ) - - # we want global names so that different tasks can coordinate priorities - worker_to_assign = (group_id + group_offset) % num_worker_groups - priority_actor_name = f"priority_processor.{worker_to_assign}" - - reader_actor = PriorityProcessorActor.options( # type: ignore - name=priority_actor_name, get_if_exists=True - ).remote() - - reader_actor.assign_work.remote(work_item) - - self._shard_readers.append(reader_actor) - - def new_chunk(self, shard_name: str, *chunks: ChunkMetadata): - """Callback method for when a shard worker has produced a new chunk.""" - self.shard_status[shard_name].current_buffer.extend(chunks) - - # if we have buffered chunks, we need to check if we can send them to the broker - self._attempt_to_flush_buffers() - - self._metrics.chunks_finished += len(chunks) - # update metrics - for chunk in chunks: - self._metrics.rows_finished += chunk.num_rows - for field, count in chunk.field_counts.items(): - self._metrics.field_counts[field] = self._metrics.field_counts.get(field, 0) + count - - if len(chunks) > 0: - ray.get(self.broker_ref._new_metrics.remote(self._metrics)) - - def shard_finished(self, shard_name: str, expected_num_chunks: int): - """Callback method for 
when a shard worker has finished.""" - shard_status = self.shard_status[shard_name] - assert ( - shard_status.expected_num_chunks is None - ), f"Shard {shard_name} already finished: {shard_status.expected_num_chunks} {expected_num_chunks}" - shard_status.expected_num_chunks = expected_num_chunks - - # we might still have buffered chunks, so we need to check if we can append them - self._attempt_to_flush_buffers() - self._metrics.shards_finished += 1 - ray.get(self.broker_ref._new_metrics.remote(self._metrics)) - - # if there are no more active shards, we're done - if self._all_shards_done(): - assert len(self._current_round_robin) == 0 - self._finish() - - def _all_shards_done(self): - return all(status.is_finished_and_buffer_empty for status in self.shard_status.values()) - - def shard_failed(self, shard_name: str, error: ExceptionInfo): - """Callback method for when a shard worker has failed.""" - ray.get(self.broker_ref._writer_exception.remote(shard_name, error)) - - def other_failed(self, error: ExceptionInfo): - """Callback method for when a shard worker has failed.""" - ray.get(self.broker_ref._writer_exception.remote(None, error)) - - def _attempt_to_flush_buffers(self): - # this is the most complex logic in this class. - # The global order on chunks is defined as a roundrobin over shards, until one shard is done. - # After that, that shard is removed from the roundrobin and the process continues. - # Roundrobin order is determined by self.source.shard_names - - # We are happy to release chunks that form a prefix of the global order so that they can be read. - # To do that, we maintain the roundrobin order in self._current_round_robin - # and we maintain the current buffer for each shard in self.shard_status. - # When we get a new chunk, we append it to the buffer for that shard. - # When we get a finished message, we mark that shard as finished. - # In either case, we check if we can send any chunks from the front of the roundrobin. - # If we can, we send them to the broker - - # here "finished" means that the shard has sent all of its chunks and has told us that it's done. - - chunks_to_send = [] - - while len(self._current_round_robin) > 0: - name = self._current_round_robin[0] - status = self.shard_status[name] - if status.is_finished_and_buffer_empty: - # we're done with this shard, so we can remove it from the roundrobin - self._current_round_robin.pop(0) - logger.debug(f"Shard {name} is finished, removing from round robin") - continue - - # now let's see if we can send a chunk from this shard - next_chunk = status.pop_chunk_to_send() - if next_chunk is not None: - # we can send a chunk from this shard - self._current_round_robin.pop(0) - self._current_round_robin.append(name) - chunks_to_send.append(next_chunk) - continue - else: - # we can't send a chunk from this shard, so we can't send any additional chunks - if self.logger.level <= pylogging.DEBUG: - chunks_waiting = [ - f"{n2} ({len(s2.current_buffer)})" - for n2, s2 in self.shard_status.items() - if len(s2.current_buffer) > 0 - ] - msg = ( - f"Shard {name} has no chunks to send and is not known to be finished. 
We have this many queued" - f" chunks: {chunks_waiting}" - ) - self.logger.debug(msg) - break - - if len(chunks_to_send) > 0: - logger.debug(f"Sending {len(chunks_to_send)} chunks to broker") - ray.get(self.broker_ref._append_chunks.remote(*chunks_to_send)) - - def _finish(self): - self._metrics.is_finished = True - ray.get(self.broker_ref._new_metrics.remote(self._metrics)) - ray.get(self.broker_ref._finalize.remote()) - # self._shard_writers = [] - # self._shard_readers = [] - - -@ray.remote(num_cpus=0) -class ChunkCacheBroker(SnitchRecipient): - """Actor that manages the global order on chunks and vends chunk metadata to readers.""" - - chunks: List[ChunkMetadata] - _reader_promises: Dict[int, asyncio.Future[ChunkMetadata]] - _finished_promise: asyncio.Future[None] - - def __init__( - self, - cache_dir: str, - source: ShardedDataset[T], - processor: BatchProcessor[T], - rows_per_chunk: int, - cache_config: Optional[Dict[str, Any]], - ): - pylogging.basicConfig(level=pylogging.INFO, format=LOG_FORMAT) - self.chunks = [] - self._reader_promises = {} - self._is_finished = False - self._source = source - self._processor = processor - self._cache_dir = cache_dir - self._rows_per_chunk = rows_per_chunk - self._finished_promise = asyncio.Future() - # used to subscribe to metrics updates - self._latest_metrics = InProgressCacheMetrics() - self._metrics_condition = asyncio.Condition() - self._cache_config = cache_config - path_for_name = os.path.join(*self._cache_dir.split("/")[-2:]) - name = f"broker::{path_for_name}" - self.logger = pylogging.getLogger(f"{name}") - - # initialize writer task - # first see if we need to do anything: check the ledger for is_finished - try: - cache_ledger = _load_cache_ledger(self._cache_dir) - self._append_chunks(*cache_ledger.chunks) - self._is_finished = True - self._finished_promise.set_result(None) - except FileNotFoundError: - self_ref = ray.runtime_context.get_runtime_context().current_actor - # only use the last two components of the name since it gets kind of long - name = f"builder::{path_for_name}" - self._builder_actor = ChunkCacheBuilder.remote( # type: ignore - self_ref, - self._cache_dir, - name, - self._source, - self._processor, - rows_per_chunk, - ) # type: ignore - - def is_finished(self): - return self._is_finished - - async def finished_sentinel(self): - await self._finished_promise - - async def updated_metrics(self) -> InProgressCacheMetrics: - if self._finished_promise.done(): - if self._finished_promise.exception() is not None: - raise self._finished_promise.exception() # type: ignore - else: - return self._latest_metrics - - async with self._metrics_condition: - await self._metrics_condition.wait() - return self._latest_metrics - - async def get_chunk(self, chunk_idx: int) -> Optional[ChunkMetadata]: - assert isinstance(self.chunks, list), self.chunks - if chunk_idx < len(self.chunks): - return self.chunks[chunk_idx] - elif self._is_finished: - return None - elif self._finished_promise.exception() is not None: - raise self._finished_promise.exception() # type: ignore - else: - if chunk_idx not in self._reader_promises: - self._reader_promises[chunk_idx] = asyncio.Future() - return await self._reader_promises[chunk_idx] - - async def final_chunk_count(self) -> Optional[int]: - if self._is_finished: - return len(self.chunks) - else: - return None - - def _append_chunks(self, *chunks: ChunkMetadata): - for chunk in chunks: - self.chunks.append(chunk) - chunk_idx = len(self.chunks) - 1 - self.logger.debug(f"Received chunk {chunk_idx}") - if 
chunk_idx in self._reader_promises: - self.logger.debug(f"Resolving promise for chunk {chunk_idx}") - self._reader_promises[chunk_idx].set_result(chunk) - del self._reader_promises[chunk_idx] - - def _new_metrics(self, metrics): - self._latest_metrics = metrics - self._do_notify() - - def _do_notify(self): - async def _do_notify_async(): - async with self._metrics_condition: - self._metrics_condition.notify_all() - - asyncio.create_task(_do_notify_async()) - - def _child_failed(self, child: ray.actor.ActorHandle, exception: ExceptionInfo): - try: - super()._child_failed(child, exception) - except Exception as e: - logger.exception("Error in child_failed") - self._writer_exception(None, ser_exc_info(e)) - - def _writer_exception(self, shard_name, exc_info: ExceptionInfo): - info = exc_info.restore() - - logger.exception(f"Writer task {shard_name} failed with exception", exc_info=info) - for future in self._reader_promises.values(): - future.set_exception(info[1]) - - self._reader_promises = {} - - self._finished_promise.set_exception(info[1]) - self._do_notify() - - def _finalize(self): - logger.info(f"Finalizing cache {self._cache_dir}...") - self._is_finished = True - for k, future in self._reader_promises.items(): - future.set_result(None) - - # write ledger - _serialize_json_and_commit( - os.path.join(self._cache_dir, LEDGER_FILE_NAME), CacheLedger(self.chunks, self._cache_config) - ) - - self._reader_promises = {} - # TODO: For some reason this crashes other actors with weird reference counting assertion errors. - # pretty sure it's a ray bug - # self._builder_actor = None - self._finished_promise.set_result(None) - - # notify metrics subscribers - self._do_notify() - - -def _get_broker_actor( - cache_dir, - input_shards, - processor, - cache_config=None, - rows_per_chunk=DEFAULT_ROWS_PER_CHUNK, -): - return ChunkCacheBroker.options( - name="lev_cache_manager::" + cache_dir.replace("/", "--"), get_if_exists=True, lifetime="detached" - ).remote( - # type: ignore - cache_dir=cache_dir, - source=input_shards, - processor=processor, - cache_config=cache_config, - rows_per_chunk=rows_per_chunk, - ) - - -class DictCacheDataset(ShardableDataset[dict]): - """ - A Dataset that yields HF BatchEncodings from a ShardCache. - This basically yields a dict-of-arrays, just the HF BatchEncoding class version of dict. - """ - - def __init__(self, cache: "ShardCache", return_batches: bool = False): - self.cache = cache - self.return_batches = return_batches - - def __iter__(self) -> Iterator[dict]: - for batch in self.cache: - encoding = dict_from_record_batch(batch) - - if self.return_batches: - yield encoding - else: - batch_size = 0 - for v in encoding.values(): - batch_size = len(v) - break - - for i in range(batch_size): - yield {k: v[i] for k, v in encoding.items()} - - def shard(self, shard_id: int, num_shards: int) -> "DictCacheDataset": - return DictCacheDataset(self.cache.shard(shard_id, num_shards)) - - @staticmethod - def load(cache_dir: str, return_batches: bool = False, batch_size: Optional[int] = None) -> "DictCacheDataset": - if batch_size is None: - batch_size = 1 - cache = ShardCache.load(cache_dir, batch_size=batch_size) - return DictCacheDataset(cache, return_batches=return_batches) - - -class ShardCache(Iterable[pa.RecordBatch]): - """A cache which is backed by a collection of chunks of preprocessed documents. These chunks - are produced by tokenizing/preprocessing a ShardedDataset. - - This is the main interface for building and reading from a shard cache. 
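The broker above hands out chunk metadata with a simple promise scheme: a reader asking for an index that has not been produced yet awaits an asyncio future, and _append_chunks resolves that future when the chunk arrives. A condensed sketch of the idea, with assumed names and none of the Ray wiring:

```python
import asyncio

# Simplified sketch (assumed names) of the reader-promise pattern used by the broker.
class ChunkIndex:
    def __init__(self):
        self.chunks: list = []
        self.promises: dict[int, asyncio.Future] = {}

    async def get(self, idx: int):
        if idx < len(self.chunks):
            return self.chunks[idx]          # already produced
        if idx not in self.promises:
            self.promises[idx] = asyncio.get_running_loop().create_future()
        return await self.promises[idx]      # wait until the producer appends it

    def append(self, chunk):
        self.chunks.append(chunk)
        idx = len(self.chunks) - 1
        if idx in self.promises:             # wake any reader waiting on this index
            self.promises.pop(idx).set_result(chunk)
```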
- - ShardCache has the following objectives: - - 1) Deterministic ordering over the data - 2) Sharded reading - 3) Sharded writing - 4) Simultaneous reading and writing of shards - 5) Fast resumption of writing - 6) Fast resumption of reading - - ShardCache achieves (1), (2), and (3) maintaining a reproducible global ordering over "chunks" created from shards. - The global ordering is defined by taking chunks round-robin from each shard. This allows us to read shards - in parallel and deterministically. - - ShardCache achieves (4) also via the chunking mechanism. As soon as all shards have written a chunk, the next - chunk can be read. This allows us to read and write in parallel. - - ShardCache achieves (5) by writing chunks to disk as soon as they are completed and serializing a state - of the chunks that have been written for each shard. This allows us to resume from the last chunk that was written. - - # TODO (6) isn't implemented just yet - - ShardCache achieves (6) by storing metadata about the chunks that have been written in a state. In addition - to the global ordering, the state also stores the number of documents in each chunk as well as the number - of tokens. - """ - - ledger: Optional[CacheLedger] - _broker: Optional[ActorHandle] - # We use a thread here instead of an actor because we want to ensure it's on the same process as the ShardCache - # object. - _monitor_thread: Optional[threading.Thread] - _metrics_monitors: List[MetricsMonitor] - - def __init__( - self, - cache_dir: str, - batch_size: int, - ledger: Optional[CacheLedger], - _broker: Optional[ActorHandle], - reader_offset: int = 0, - num_readers: int = 1, - ): - self.cache_dir = cache_dir - self.ledger = ledger - self._broker = _broker - self._batch_size = batch_size - - self._metrics_monitors = [] - self._monitor_thread = None - - self._num_readers = num_readers - self._reader_offset = reader_offset - name = os.path.join(*cache_dir.split("/")[-2:]) - self.logger = pylogging.getLogger(f"ShardCache.{name}") - - @staticmethod - def load(cache_dir: str, batch_size: int) -> "ShardCache": - """Loads a cache from disk. 
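The global ordering those objectives rely on is a round-robin over shards: take one chunk from each shard in turn, drop a shard from the rotation once it is finished and drained, and stop releasing as soon as the shard at the head of the rotation has nothing ready. A standalone sketch under those assumptions (not the actual builder/broker code):

```python
from collections import deque

# Minimal sketch (assumed shapes): release chunks in a deterministic round-robin
# over shards so the released chunks always form a prefix of the global order.
def flush_round_robin(rotation: deque, buffers: dict, finished: set) -> list:
    released = []
    while rotation:
        shard = rotation[0]
        if shard in finished and not buffers[shard]:
            rotation.popleft()            # shard fully drained: remove from rotation
        elif buffers[shard]:
            released.append(buffers[shard].pop(0))
            rotation.rotate(-1)           # this shard moves to the back of the rotation
        else:
            break                         # head shard has nothing ready yet: stop
    return released

# Example: rotation=deque(["a", "b", "c"]), finished={"b"},
# buffers={"a": ["a0"], "b": [], "c": []}  ->  releases ["a0"], then stops at "c".
```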
Raises FileNotFoundError if the cache doesn't exist""" - logger.info(f"Loading cache from {cache_dir}") - ledger = _load_cache_ledger(cache_dir) - return ShardCache(cache_dir, batch_size, ledger, None) - - @staticmethod - def build_or_load( - cache_dir: str, - shard_source: ShardedDataset[T], - processor: BatchProcessor[T], - batch_size: int, - rows_per_chunk: int, - cache_config: Optional[Dict[str, Any]] = None, - ): - try: - return ShardCache.load(cache_dir, batch_size) - except FileNotFoundError: - broker = _get_broker_actor( - cache_dir=cache_dir, - input_shards=shard_source, - processor=processor, - cache_config=cache_config, - rows_per_chunk=rows_per_chunk, - ) - return ShardCache(cache_dir=cache_dir, batch_size=batch_size, ledger=None, _broker=broker) - - def finished_sentinel(self): - """Returns a Ray-awaitable object that will be set when the cache is finished""" - if self._broker is None: - return ray.remote(num_cpus=0)(lambda: None).remote() - else: - return self._broker.finished_sentinel.remote() - - @property - def is_finished(self): - """Returns whether the cache is finished""" - if self._broker is None: - return True - else: - return ray.get(self._broker.is_finished.remote()) - - def read_chunk(self, chunk_idx: int) -> Iterator[pa.RecordBatch]: - """Reads a chunk from the cache""" - chunk = self.get_chunk(chunk_idx) - yield from self._read_chunk(chunk) - - def _map_index(self, index): - return index * self._num_readers + self._reader_offset - - def get_chunk(self, index: int, *, timeout: Optional[float] = None) -> ChunkMetadata: - """Returns the metadata for a given chunk index""" - mapped_index = self._map_index(index) - return self._get_chunk_unmapped(mapped_index, timeout=timeout) - - def _get_chunk_unmapped(self, mapped_index: int, *, timeout: Optional[float] = None) -> ChunkMetadata: - if self.ledger is not None: - return self.ledger.chunks[mapped_index] - else: - assert self._broker is not None - time_in = time.time() - next_time = time_in - # we want to also log if we're waiting for a long time, so we do this in a loop - while timeout is None or next_time - time_in < timeout: - current_timeout = 20.0 - if timeout is not None: - current_timeout = min(current_timeout, timeout - (next_time - time_in)) - try: - chunk = ray.get(self._broker.get_chunk.remote(mapped_index), timeout=current_timeout) - except GetTimeoutError: - self.logger.warning(f"Waiting for chunk {mapped_index} for {int(next_time - time_in)} seconds") - next_time = time.time() - current_timeout *= 2 - current_timeout = min(current_timeout, 100) - continue - except asyncio.exceptions.InvalidStateError: - self.logger.warning( - f"Invalid state waiting for chunk {mapped_index} for {int(next_time - time_in)} seconds" - ) - next_time = time.time() - current_timeout *= 2 - current_timeout = min(current_timeout, 100) - time.sleep(current_timeout) - continue - - if chunk is None: - raise IndexError(f"Chunk index out of bounds. (Mapped index {mapped_index})") - - return chunk - - if timeout is not None: - raise TimeoutError(f"Timeout while waiting for chunk {mapped_index}") - - async def get_chunk_async(self, index: int) -> ChunkMetadata: - """Returns the metadata for a given chunk index""" - mapped_index = self._map_index(index) - if self.ledger is not None: - return self.ledger.chunks[mapped_index] - else: - assert self._broker is not None - chunk = await self._broker.get_chunk.remote(mapped_index) - if chunk is None: - raise IndexError(f"Chunk index {index} out of bounds. 
(Mapped index {mapped_index})") - return chunk - - def final_chunk_count(self) -> Optional[int]: - """Returns the number of chunks in the cache, if known""" - if self.ledger is not None: - return len(self.ledger.chunks) - else: - assert self._broker is not None - return ray.get(self._broker.final_chunk_count.remote()) - - def iter_batches_from_chunks(self, loop: bool = False): - shard_offset = self._reader_offset - - if self.ledger is not None: - num_chunks = len(self.ledger.chunks) - - if num_chunks == 0: - return - - while True: - i = 0 - for i in range(shard_offset, num_chunks, self._num_readers): - chunk = self.ledger.chunks[i] - yield from self._read_chunk(chunk) - - if not loop: - break - - shard_offset = i % len(self.ledger.chunks) - else: - assert self._broker is not None - i = shard_offset - while True: - try: - self.logger.debug(f"Reading chunk {i}") - chunk = self._get_chunk_unmapped(i) - i += self._num_readers - yield from self._read_chunk(chunk) - except IndexError: - if loop: - num_chunks = ray.get(self._broker.final_chunk_count.remote()) - assert num_chunks is not None - - i = i % num_chunks - else: - break - except Exception as e: - self.logger.exception("Error while reading from shard cache.") - raise e - - def __iter__(self): - return self.iter_batches_from_chunks() - - def shard(self, offset, num_readers): - """ - Returns a shard of this shard cache. This method shards w.r.t the current shard cache, not the base shard cache. - - Args: - offset: - num_readers: - - Returns: - (ShardCache): A shard of this shard cache. - """ - if offset >= num_readers: - raise ValueError(f"Shard index {offset} is out of range") - - if num_readers == 1: - return self - - new_offset = self._reader_offset * num_readers + offset - new_num_readers = self._num_readers * num_readers - return ShardCache(self.cache_dir, self._batch_size, self.ledger, self._broker, new_offset, new_num_readers) - - def unshard(self): - """ - Gets the "base" shard cache that this shard cache is a shard of. - """ - return ShardCache(self.cache_dir, self._batch_size, self.ledger, self._broker, 0, 1) - - def with_batch_size(self, batch_size): - return ShardCache( - self.cache_dir, batch_size, self.ledger, self._broker, self._reader_offset, self._num_readers - ) - - def _read_chunk(self, chunk): - reader = _ChunkReader.from_metadata(self.cache_dir, chunk, self._batch_size) - for batch in reader: - yield batch - - def await_finished(self, timeout: Optional[float] = None): - return ray.get(self.finished_sentinel(), timeout=timeout) - - def attach_metrics_monitor(self, monitor: MetricsMonitor): - if self._broker is None: - # TODO: decide what to do about attaching if the cache is already finished - # maybe get the final metrics? 
- return - - self._metrics_monitors.append(monitor) - if self._monitor_thread is None: - self._monitor_thread = threading.Thread(target=self._monitor_metrics) - self._monitor_thread.start() - - def _monitor_metrics(self): - while True: - try: - metrics = ray.get(self._broker.updated_metrics.remote()) - for monitor in self._metrics_monitors: - monitor(metrics) - if metrics.is_finished: - break - except Exception as e: - self.logger.exception("Error while reading metrics from shard cache.") - raise e - - -class _ChunkReader: - """Reads batches of documents from a chunk""" - - metadata: ChunkMetadata - file: pq.ParquetFile - batch_size: int - - # TODO: seek by doc - # TODO: seek by token etc - - def __init__(self, metadata: ChunkMetadata, file: pq.ParquetFile, batch_size: int): - self.metadata = metadata - self.file = file - self.batch_size = batch_size - - def with_batch_size(self, batch_size): - return _ChunkReader(self.metadata, self.file, batch_size) - - @property - def num_docs(self): - return self.metadata.num_rows - - def field_count(self, field, default=None): - return self.metadata.field_counts.get(field, default) - - @property - def __len__(self): - return (self.num_docs + self.batch_size - 1) // self.batch_size - - def __iter__(self) -> Iterator[pa.RecordBatch]: - for record_batch in self.file.iter_batches(batch_size=self.batch_size): - yield record_batch - - @staticmethod - def from_metadata(cache_dir, metadata: ChunkMetadata, batch_size: int) -> "_ChunkReader": - file = pq.ParquetFile(fsspec.open(os.path.join(cache_dir, f"{metadata.name}.parquet"), "rb").open()) - return _ChunkReader(metadata, file, batch_size) diff --git a/src/levanter/data/sharded_dataset.py b/src/levanter/data/sharded_datasource.py similarity index 89% rename from src/levanter/data/sharded_dataset.py rename to src/levanter/data/sharded_datasource.py index e16f5fce7..38682616d 100644 --- a/src/levanter/data/sharded_dataset.py +++ b/src/levanter/data/sharded_datasource.py @@ -2,7 +2,20 @@ import json import os import warnings -from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, List, Optional, Sequence, Sized, Tuple, TypeVar +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Generic, + Iterable, + Iterator, + List, + Optional, + Sequence, + Sized, + Tuple, + TypeVar, +) import datasets import fsspec @@ -10,6 +23,7 @@ from levanter.utils import fsspec_utils +from ..data import AsyncDataset from ._preprocessor import ( BatchResult, _BatchMapTransform, @@ -17,7 +31,6 @@ _DatasetTransform, _MapTransform, ) -from .dataset import Dataset, ShardableDataset from .utils import batched @@ -30,7 +43,7 @@ U = TypeVar("U") -class ShardedDataset(Dataset[T_co]): +class ShardedDataSource(Generic[T_co]): """ A ShardedDataset is the main interface for reading data. It's basically a mapping from shard names to iterators, with the extra feature that it exposes the ability to skip to a particular row in a shard. @@ -66,10 +79,9 @@ def build_or_load_cache( self, path: str, *, - rows_per_chunk: Optional[int] = None, await_finished: bool = True, monitors: Optional[Sequence["MetricsMonitor"]] = None, - ) -> ShardableDataset[dict]: + ) -> AsyncDataset[T]: """ Constructs a shard cache version of this dataset using Ray. @@ -79,36 +91,30 @@ def build_or_load_cache( * interruptible and resumable * streaming results (no need to wait for everything to finish) - *Note that build_cache does not in general preserve the order of the data.* - Note that this is an experimental API and is subject to change. 
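As a rough usage sketch of this API: the paths, the "text" field name, and whether a plain mapped record type caches cleanly are all assumptions for illustration, not something this patch guarantees.

```python
# Hypothetical usage sketch: build or load a cache for a JSONL datasource.
from levanter.data.sharded_datasource import datasource_from_jsonl

source = datasource_from_jsonl(["data/shard-00.jsonl.gz", "data/shard-01.jsonl.gz"])
texts = source.map(lambda doc: doc["text"])  # field name assumed
cache = texts.build_or_load_cache("cache/my_texts", await_finished=True)
```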
Returns: - A new dataset that is backed by the cache. + A new AsyncDataset that is backed by the cache. """ - from levanter.data.shard_cache import DEFAULT_ROWS_PER_CHUNK, DictCacheDataset, build_or_load_cache - - if rows_per_chunk is None: - rows_per_chunk = DEFAULT_ROWS_PER_CHUNK source, processor = _construct_composite_batch_processor(self) + from ..store.cache import build_or_load_cache cache = build_or_load_cache( path, source, processor, - rows_per_chunk=rows_per_chunk, await_finished=await_finished, monitors=monitors, ) - return DictCacheDataset(cache) + return cache - def map(self, fn: Callable[[T_co], U]) -> "ShardedDataset[U]": - return _MappedShardedDataset(self, fn) + def map(self, fn: Callable[[T_co], U]) -> "ShardedDataSource[U]": + return _MappedShardedDataSource(self, fn) def map_batches( self, fn: Callable[[list[T_co]], BatchResult], batch_size, *, num_cpus=1, num_gpus=0, **resources - ) -> "ShardedDataset[dict]": + ) -> "ShardedDataSource[dict]": """ **Lazily** map a function over batches of data. This is useful for doing things like batching data for a model, or for batched preprocessing. @@ -125,21 +131,25 @@ def map_batches( Returns: A new ShardedDataset. """ - return _BatchMappedShardedDataset(self, fn, batch_size, num_cpus=num_cpus, num_gpus=num_gpus, **resources) + return _BatchMappedShardedDataSource(self, fn, batch_size, num_cpus=num_cpus, num_gpus=num_gpus, **resources) -def dataset_from_hf(id: str, *, split, **kwargs) -> ShardedDataset[dict]: +def datasource_from_hf(id: str, *, split, **kwargs) -> ShardedDataSource[dict]: """ Create a ShardedDataset from a HuggingFace dataset. Arguments are passed to load_dataset. """ - return WrappedHFDataset(id, split=split, **kwargs) + return WrappedHFDataSource(id, split=split, **kwargs) + + +def datasource_from_jsonl(urls_or_paths: Sequence[str]) -> ShardedDataSource[dict]: + return JsonlDataSource(urls_or_paths) -def dataset_from_jsonl(urls_or_paths: Sequence[str]) -> ShardedDataset[dict]: - return JsonlDataset(urls_or_paths) +def datasource_from_json(urls_or_paths: Sequence[str]) -> ShardedDataSource[dict]: + return JsonDataSource(urls_or_paths) -class WrappedHFDataset(ShardedDataset[dict]): +class WrappedHFDataSource(ShardedDataSource[dict]): """ This class is responsible for loading a dataset from HuggingFace Datasets and returning the shards. Only (some) IterableDatasets are actually sharded in any meaningful way, so we just return a single shard @@ -189,7 +199,7 @@ def _load_dataset(self): return datasets.load_dataset(self.id, split=self.split, streaming=self.streaming, **self.kwargs) -class TextUrlDataset(ShardedDataset[str]): +class TextUrlDataSource(ShardedDataSource[str]): """ Dataset for various text formats. """ @@ -232,7 +242,7 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[str]: raise ValueError(f"Unknown format {format}") -class AudioTextUrlDataset(ShardedDataset[Tuple[np.ndarray, int, str]]): +class AudioTextUrlDataSource(ShardedDataSource[Tuple[np.ndarray, int, str]]): """ Dataset for various audio and text formats. 
""" @@ -267,6 +277,8 @@ def _load_audio_file(file_name, sampling_rate): audio = {"array": array, "sampling_rate": sr} elif "path" in audio_pointer: audio = _load_audio_file(audio_pointer["path"], sampling_rate) + else: + raise ValueError(f"Unsupported audio format {audio_pointer}") elif isinstance(audio_pointer, str): # This supports filename pointers to arbitrary audio types audio = _load_audio_file(audio_pointer, sampling_rate) @@ -287,14 +299,14 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[Tuple[np.ndar if i >= row: mat_json = json.loads(line) audio_pointer = mat_json[self.audio_key] - audio = AudioTextUrlDataset.resolve_audio_pointer(audio_pointer, self.sampling_rate) + audio = AudioTextUrlDataSource.resolve_audio_pointer(audio_pointer, self.sampling_rate) yield (audio["array"], audio["sampling_rate"], mat_json[self.text_key]) i += 1 case ".json": data = json.load(f) for doc in data[row:]: audio_pointer = doc[self.audio_key] - audio = AudioTextUrlDataset.resolve_audio_pointer(audio_pointer, self.sampling_rate) + audio = AudioTextUrlDataSource.resolve_audio_pointer(audio_pointer, self.sampling_rate) yield (audio["array"], audio["sampling_rate"], doc[self.text_key]) case _: raise ValueError(f"Unknown format {format}") @@ -348,7 +360,7 @@ def _sniff_format_for_dataset(url): return format_from_url -class JsonlDataset(ShardedDataset[dict]): +class JsonlDataSource(ShardedDataSource[dict]): def __init__(self, urls): self.urls = urls self._shard_name_to_url_mapping = _mk_shard_name_mapping(urls) @@ -369,7 +381,7 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[dict]: i += 1 -class TextDataset(ShardedDataset[dict]): +class TextDataSource(ShardedDataSource[dict]): def __init__(self, urls): self.urls = urls self._shard_name_to_url_mapping = _mk_shard_name_mapping(urls) @@ -388,7 +400,7 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[dict]: i += 1 -class JsonDataset(ShardedDataset[dict]): +class JsonDataSource(ShardedDataSource[dict]): def __init__(self, urls): self.urls = urls self._shard_name_to_url_mapping = _mk_shard_name_mapping(urls) @@ -440,12 +452,12 @@ def _mk_shard_name_mapping(urls): class _TransformedDataset: - source: ShardedDataset + source: ShardedDataSource _transform: _DatasetTransform -class _MappedShardedDataset(ShardedDataset[T], _TransformedDataset): - def __init__(self, source: ShardedDataset[T_co], fn: Callable[[T_co], T]): +class _MappedShardedDataSource(ShardedDataSource[T], _TransformedDataset): + def __init__(self, source: ShardedDataSource[T_co], fn: Callable[[T_co], T]): self.source = source self.fn = fn self._transform = _MapTransform(fn) @@ -458,10 +470,10 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[T]: return map(self.fn, self.source.open_shard_at_row(shard_name, row)) -class _BatchMappedShardedDataset(ShardedDataset[T], _TransformedDataset): +class _BatchMappedShardedDataSource(ShardedDataSource[T], _TransformedDataset): def __init__( self, - source: ShardedDataset[T_co], + source: ShardedDataSource[T_co], fn: Callable[[list[T_co]], Iterable[U]], batch_size, num_cpus=1, diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index c29e55e83..fc9ce8052 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -1,4 +1,5 @@ import abc +import asyncio import copy import dataclasses import functools @@ -7,7 +8,7 @@ from dataclasses import dataclass from functools import cached_property from itertools import chain -from typing import Dict, Iterator, 
List, Mapping, Optional, Sequence, Tuple, Union +from typing import Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, TypeVar, Union import braceexpand import datasets @@ -15,20 +16,28 @@ import fsspec import jax import numpy as np -import pyarrow as pa import regex +import tensorstore as ts from draccus import field +from jax._src.random import PRNGKey from jaxtyping import PRNGKeyArray +from tokenizers import normalizers import haliax as hax from haliax import Axis +from levanter.data import AsyncDataset +from levanter.data.dataset import MappedAsyncDataset from levanter.data.mixture import MixtureDataset, StopStrategy +from levanter.data.permutation import EraConfig # intercept the logging nonsense here from levanter.logging import silence_transformer_nag # noqa from levanter.models.attention import AttentionMask from levanter.models.lm_model import LmExample +from levanter.store.cache import TreeCache +from levanter.store.jagged_array import JaggedArrayStore +from levanter.store.tree_store import TreeStore from levanter.utils.hf_utils import num_cpus_used_by_tokenizer @@ -36,18 +45,16 @@ from transformers import BatchEncoding, PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast # noqa from levanter.compat.hf_checkpoints import load_tokenizer # noqa -from levanter.data._preprocessor import BatchProcessor, dict_from_record_batch # noqa -from levanter.data.dataset import ShardableDataset, ShuffleDataset # noqa +from levanter.data._preprocessor import BatchProcessor, U, dict_from_record_batch # noqa from levanter.data.metrics_monitor import LoggerMetricsMonitor, LoggingMetricsMonitor, MetricsMonitor # noqa -from levanter.data.shard_cache import DEFAULT_ROWS_PER_CHUNK # noqa -from levanter.data.shard_cache import CacheLedger # noqa -from levanter.data.shard_cache import LEDGER_FILE_NAME as NEW_LEDGER_FILE_NAME # noqa -from levanter.data.shard_cache import ChunkMetadata, ShardCache, build_or_load_cache # noqa -from levanter.data.sharded_dataset import ShardedDataset, TextUrlDataset, WrappedHFDataset # noqa +from levanter.data.sharded_datasource import ShardedDataSource, TextUrlDataSource, WrappedHFDataSource # noqa from levanter.shapes import NamedShapeSpec, ShapeSpec # noqa -from levanter.utils.jax_utils import use_cpu_device # noqa +from levanter.store.cache import build_or_load_cache # noqa +from levanter.utils.jax_utils import key_iterator, local_cpu_mesh, use_cpu_device # noqa +T_co = TypeVar("T_co", covariant=True) + logger = logging.getLogger("levanter.data.text") # TASKS: @@ -59,41 +66,110 @@ DEFAULT_IGNORE_INDEX = -100 # Mirrors pytorch's default ignore index -class CausalLmDataset(ShardableDataset[LmExample]): +class TokenSeqDataset(AsyncDataset[np.ndarray]): + """ + A dataset that yields sequences of tokens of fixed length from an underlying TreeCache. 
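The arithmetic behind this dataset is simple: cached documents are concatenated into one flat token array, example i is the window starting at offset i * seq_len, and the usable length is data_size // seq_len. A plain-NumPy sketch of that mapping (the real implementation reads the windows asynchronously from a tensorstore-backed store):

```python
import numpy as np

# Minimal sketch (plain NumPy, assumed shapes): sequence i is the token window
# [i * seq_len, (i + 1) * seq_len) of the flat token array.
def get_batch(flat_tokens: np.ndarray, indices, seq_len: int):
    num_seqs = len(flat_tokens) // seq_len
    if any(i >= num_seqs for i in indices):
        raise ValueError("Requested indices beyond the end of the dataset")
    return [flat_tokens[i * seq_len : (i + 1) * seq_len] for i in indices]
```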
+ + :param doc_cache: the TreeCache to read from + :param seq_len: The max length of sequences to emit + """ + + def __init__(self, doc_cache: TreeCache[dict], seq_len: int): + self.doc_cache = doc_cache + self.seq_len = seq_len + self._store: Optional[TreeStore] = None + self._cached_len: Optional[int] = None + + async def async_len(self) -> int: + await self.doc_cache.finished() + token_arrays = await self._await_token_cache() + return token_arrays.data_size // self.seq_len + + async def _await_token_cache(self) -> JaggedArrayStore: + if self._store is None: + self._store = await self.doc_cache.store_async() + return self._store.tree["input_ids"] + + async def final_length_is_known(self) -> bool: + return await self.doc_cache.final_length_is_known() + + def is_finite(self) -> bool: + return True + + async def current_len(self) -> Optional[int]: + store = await self._await_token_cache() + return store.data_size // self.seq_len + + async def get_batch(self, indices: Sequence[int]) -> Sequence[T_co]: + token_arrays = await self._await_token_cache() + # logger.info(f"Time to get token cache: {time.time() - time_in}") + len = await self.wait_until_len_at_least(max(indices) + 1) + if len is not None and len < max(indices) + 1: + raise ValueError("Requested indices beyond the end of the dataset") + offsets = np.array(indices) * self.seq_len + with ts.Batch(): + out = [] + for offset in offsets: + out.append(token_arrays.data[offset : offset + self.seq_len].read()) + + out = await asyncio.gather(*out) + + return out + + def get_batch_sync(self, indices: Sequence[int]) -> Sequence[T_co]: + token_arrays = self.doc_cache.store.tree["input_ids"] + # logger.info(f"Time to get token cache: {time.time() - time_in}") + # len = await self.wait_until_len_at_least(max(indices) + 1) + # if len is not None and len < max(indices) + 1: + # raise ValueError("Requested indices beyond the end of the dataset") + offsets = np.array(indices) * self.seq_len + with ts.Batch(): + out = [] + for offset in offsets: + out.append(token_arrays.data[offset : offset + self.seq_len].read()) + # logger.info(f"Time to read token cache: {time.time() - time_in}") + + out = [x.result() for x in out] + # logger.info(f"Time to wait for token cache: {time.time() - time_in}") + return out + + async def wait_until_len_at_least(self, length: int) -> int: + # length is brutally slow to compute, so we cache it + if self._cached_len is not None and self._cached_len >= length: + return self._cached_len + + # TODO: would be better to listen for cache updates + length = await super().wait_until_len_at_least(length) + self._cached_len = length + return length + + +class CausalLmDataset(MappedAsyncDataset[np.ndarray, LmExample]): def __init__( self, - dataset: ShardableDataset[np.ndarray], + dataset: AsyncDataset[np.ndarray], QPos: Axis, KPos: Axis, fcm_prob: float = 0.0, - key: Optional[PRNGKeyArray] = None, + key: Optional[PRNGKey] = None, ignore_index: Optional[int] = None, ): self.dataset = dataset self.QPos = QPos self.KPos = KPos self.fcm_prob = fcm_prob - self.key = key self.ignore_id = ignore_index + self.key = key if self.fcm_prob > 0.0 and self.key is None: raise ValueError("must provide key if fcm_prob > 0.0") - def shard(self, shard_id: int, num_shards: int) -> "CausalLmDataset": - return CausalLmDataset( - self.dataset.shard(shard_id, num_shards), self.QPos, self.KPos, self.fcm_prob, self.key, self.ignore_id - ) - - def __iter__(self) -> Iterator[LmExample]: - key = self.key sharding = 
jax.sharding.SingleDeviceSharding(jax.local_devices(backend="cpu")[0]) - with use_cpu_device(): - - @functools.partial(eqx.filter_jit, out_shardings=sharding) - def _create_lm_example(tokens, key): + @functools.partial(eqx.filter_jit, out_shardings=sharding) + def _create_lm_example(tokens, key): + with local_cpu_mesh(): tokens = hax.named(tokens, self.QPos) - example = LmExample.causal(tokens=tokens, ignore_id=self.ignore_id) if self.fcm_prob > 0: @@ -109,216 +185,10 @@ def _create_lm_example(tokens, key): return example - for tokens in self.dataset: - example = _create_lm_example(tokens, key) - yield example + super().__init__(self.dataset, _create_lm_example, key=key) - -class TokenSeqDataset(ShardableDataset[np.ndarray]): - """ - A dataset that yields sequences of tokens of fixed length from a TokenizedDocumentCache. - - :param doc_cache: the TokenizedDocumentCache to draw from - :param seq_len: The max length of sequences to emit - """ - - def __init__(self, doc_cache, seq_len: int, stride: Optional[int] = None): - self.doc_cache = doc_cache - self.seq_len = seq_len - self.stride = stride - - def shard(self, shard_id: int, num_shards: int) -> "TokenSeqDataset": - """ - Split the dataset into num_processes shards. - """ - return TokenSeqDataset(self.doc_cache.shard(shard_id, num_shards), self.seq_len, self.stride) - - def __iter__(self) -> Iterator[np.ndarray]: - extra_tokens = None # BatchEncoding of the last tokens from the previous doc - for doc in self.doc_cache: - # TODO: we could be cleverer here, and avoid these expensive copies etc - # should run some benchmarks to see if it's worth it - if extra_tokens is not None: - doc = _stack_batch_encodings(extra_tokens, doc) - extra_tokens = None - - for encoded_slice in concatenate_and_group_texts(doc, self.seq_len, self.stride, drop_remainder=False): - if len(encoded_slice["input_ids"]) < self.seq_len: - assert extra_tokens is None - extra_tokens = encoded_slice - else: - extra_tokens = None - ids = encoded_slice["input_ids"] - yield ids - - @staticmethod - def load(seq_len: int, cache_dir: str, stride: Optional[int] = None) -> "TokenSeqDataset": - # Maybe force the cache to be built ahead of time? - doc_cache = TokenizedDocumentCache.load(cache_dir, True) - return TokenSeqDataset(doc_cache, seq_len, stride) - - -class BatchEncodingDataset(ShardableDataset[BatchEncoding]): - """ - A Dataset that yields HF BatchEncodings from a ShardCache. - This basically yields a dict-of-arrays, just the HF BatchEncoding class version of dict. 
- """ - - def __init__(self, cache: ShardCache, return_batches: bool = False): - self.cache = cache - self.return_batches = return_batches - - def __iter__(self) -> Iterator[BatchEncoding]: - for batch in self.cache: - encoding = _batch_encoding_from_record_batch(batch, flatten_docs=False) - if self.return_batches: - yield encoding - else: - batch_size = 0 - for v in encoding.values(): - batch_size = len(v) - break - - for i in range(batch_size): - # this doesn't work for reconstituted batches, so we have to do this - # I have no idea why this is the case - # yield encoding[i] - yield BatchEncoding({k: v[i] for k, v in encoding.items()}) - - def shard(self, shard_id: int, num_shards: int) -> "BatchEncodingDataset": - return BatchEncodingDataset(self.cache.shard(shard_id, num_shards)) - - @staticmethod - def load(cache_dir: str, return_batches: bool = False, batch_size: Optional[int] = None) -> "BatchEncodingDataset": - if batch_size is None: - batch_size = 1 - cache = ShardCache.load(cache_dir, batch_size=batch_size) - return BatchEncodingDataset(cache, return_batches=return_batches) - - -class TokenizedDocumentCache(ShardableDataset[BatchEncoding]): - """ - Represents a tokenized document cache, which is a directory of parquet files with a ledger file. - - The difference between this class and the TokenSeqDataset is that this class yields entire documents, - while the TokenSeqDataset yields tokens sequences of fixed length from concatenated documents. - """ - - def __init__(self, chunk_cache: ShardCache, flatten_docs): - self.chunk_cache = chunk_cache - self.flatten_docs = flatten_docs - - def __iter__(self): - """Reads the cache files produced by cache_and_group and yields tokenized sequences. - If flatten is false, this returns the docs as they were presented to the caching process. If flatten is True, - then the documents returned are actually concatenated documents, where the number is the number of documents - presented as a batch to the caching process.""" - for batch in self._chunks(): - yield _batch_encoding_from_record_batch(batch, self.flatten_docs) - - def _chunks(self): - return self.chunk_cache.iter_batches_from_chunks() - - @staticmethod - def build_or_load( - cache_dir, - source: ShardedDataset[str], - tokenizer: PreTrainedTokenizerBase, - *, - flatten_docs=True, - enforce_bos=True, - enforce_eos=True, - batch_size=128, - rows_per_chunk=DEFAULT_ROWS_PER_CHUNK, - monitors=None, - await_finished=True, - override_resources=None, - ) -> "TokenizedDocumentCache": - bt = BatchTokenizer( - tokenizer, enforce_bos=enforce_bos, enforce_eos=enforce_eos, override_resources=override_resources - ) - monitors = monitors or [] - cache = build_or_load_cache( - cache_dir, - source, - bt, - await_finished=await_finished, - batch_size=batch_size, - rows_per_chunk=rows_per_chunk, - monitors=monitors, - cache_config={ - "tokenizer": tokenizer.name_or_path, - "vocab_size": tokenizer.vocab_size, - }, - ) - - if cache.is_finished: - logger.info(f"Cache {cache_dir} is complete.") - else: - logger.info( - f"Cache {cache_dir} is incomplete. This will block until at least one chunk per process is complete." - ) - - if cache.ledger and "tokenizer" in cache.ledger.metadata: - cached_tokenizer = cache.ledger.metadata["tokenizer"] - cached_vocab_size = cache.ledger.metadata["vocab_size"] - if cached_tokenizer != tokenizer.name_or_path: - raise ValueError( - f"Cache {cache_dir} was built with tokenizer {cached_tokenizer}, but current tokenizer is" - f" {tokenizer.name_or_path}." 
- ) - if cached_vocab_size != tokenizer.vocab_size: - raise ValueError( - f"Cache {cache_dir} was built with vocab size {cached_vocab_size}, but current vocab size is" - f" {tokenizer.vocab_size}." - ) - - return TokenizedDocumentCache(cache, flatten_docs=flatten_docs) - - @staticmethod - def load(cache_dir, batch_size: int = 128, flatten_docs=True): - """ - Load a TokenizedDocumentCache from a directory. If the ledger file is not present, this will raise a - FileNotFoundError. - - NOTE: ATM this attempts to migrate old caches to the new format, but this will be removed in the future. - - :param cache_dir: - :param flatten_docs: If true, then multiple documents from a single batch (when the cache was built) will be - concatenated into a single document. Often one is concatenating documents anyway, so this is a useful option. - :return: - """ - - try: - cache = ShardCache.load(cache_dir, batch_size=batch_size) - return TokenizedDocumentCache(cache, flatten_docs=flatten_docs) - except FileNotFoundError: - raise FileNotFoundError(f"{cache_dir} is not a complete cache") - except Exception: - logger.exception("error loading cache") - raise - - def shard(self, shard_index, num_shards): - if num_shards <= shard_index: - raise ValueError(f"Shard index {shard_index} is out of range") - - if num_shards == 1: - return self - - return TokenizedDocumentCache(self.chunk_cache.shard(shard_index, num_shards), self.flatten_docs) - - -def _batch_encoding_from_record_batch(b: pa.RecordBatch, flatten_docs: bool): - if flatten_docs: - # insert a newaxis to the beginning so that it appears to be bs=1 - return BatchEncoding( - { - b.field(i).name: b.column(i).values.to_numpy(zero_copy_only=False)[np.newaxis, :] - for i in range(b.num_columns) - }, - ) - else: - return BatchEncoding(dict_from_record_batch(b)) + async def async_len(self) -> int: + return await self.dataset.async_len() def _maybe_force_tokenizer_parallelism(tokenizer: PreTrainedTokenizerBase): @@ -328,10 +198,12 @@ def _maybe_force_tokenizer_parallelism(tokenizer: PreTrainedTokenizerBase): os.environ["TOKENIZERS_PARALLELISM"] = "true" +LONG_STRING_WORKAROUND = 10_000 + ws = regex.compile(r"\s") -class BatchTokenizer(BatchProcessor[str]): +class BatchTokenizer(BatchProcessor[str, dict]): """ A batch processor that tokenizes a batch of strings using a tokenizer. By default, this will append eos to the end of the string, even if the tokenizer doesn't. 
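A later hunk changes __call__ to return a list of per-example dicts rather than a single BatchEncoding; the "debatch" step is just a transpose of a dict of lists. A minimal sketch with assumed names:

```python
# Minimal sketch: turn a dict-of-lists encoding into a list of per-example dicts.
def debatch(encoding: dict[str, list]) -> list[dict]:
    keys = list(encoding)
    return [dict(zip(keys, values)) for values in zip(*(encoding[k] for k in keys))]

# debatch({"input_ids": [[1, 2], [3]], "attention_mask": [[1, 1], [1]]})
# -> [{"input_ids": [1, 2], "attention_mask": [1, 1]},
#     {"input_ids": [3],    "attention_mask": [1]}]
```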
@@ -345,6 +217,7 @@ def __init__( *, batch_size=128, override_resources=None, + _workaround_len=LONG_STRING_WORKAROUND, return_attention_mask=False, padding=False, max_length=None, @@ -380,20 +253,64 @@ def __init__( self._need_to_add_eos = should_append_eos self._need_to_add_bos = should_append_bos + self._workaround_len = _workaround_len - def __call__(self, batch: Sequence[str]) -> BatchEncoding: + def __call__(self, batch: Sequence[str]) -> list[dict]: if self._need_to_add_bos: batch = [self.tokenizer.bos_token + " " + d for d in batch] if self._need_to_add_eos: batch = [d + " " + self.tokenizer.eos_token for d in batch] + if self._needs_long_sequence_workaround: + batch, needs_merge = self._break_for_long_sequences(batch) + else: + needs_merge = [] + if self.padding is not False: encoding = self.tokenizer(batch, return_attention_mask=self.return_attention_mask, verbose=False, padding=self.padding, max_length=self.max_length, truncation=True) # type: ignore else: encoding = self.tokenizer(batch, return_attention_mask=self.return_attention_mask, verbose=False) # type: ignore - return encoding + if needs_merge: + new_encoding = self._merge_split_encodings(batch, encoding, needs_merge) + encoding = BatchEncoding(new_encoding) + + # debatch the encoding + unbatched = [dict(zip(encoding, t)) for t in zip(*[encoding[k] for k in encoding])] + + return unbatched + + def _break_for_long_sequences(self, batch): + orig_lengths = [len(d) for d in batch] + # break any strings that are longer than LONG_STRING_WORKAROUND characters into smaller chunks + orig_batch = batch + batch = [] + needs_merge = [] + for i, d in enumerate(orig_batch): + needs_merge.append(False) + orig_len = orig_lengths[i] + while len(d) > self._workaround_len: + # we'd rather break strings at whitespace, so find the first whitespace + match = ws.search(d, self._workaround_len) + # this is vanishingly unlikely, but if we can't find a whitespace, just break it at the limit + if match is None: + split = len(d) + else: + split = match.start() + + batch.append(d[:split]) + needs_merge.append(True) + + d = d[split:] + orig_len -= split + + batch.append(d) + return batch, needs_merge + + @property + def output_exemplar(self) -> dict: + return dict(**self.tokenizer("hi there", return_attention_mask=self.return_attention_mask, verbose=False)) @property def name_or_path(self): @@ -403,6 +320,59 @@ def name_or_path(self): def vocab_size(self): return self.tokenizer.vocab_size + @staticmethod + def _merge_split_encodings(batch, encoding, needs_merge): + # merge the encodings back together + # we might need to merge multiple encodings together + # needs merge marks the first n-1 encodings that need to be merged for each document + new_encoding = {} + for k, v in encoding.items(): + if len(v) == 0: + continue + if isinstance(v[0], np.ndarray): + assert len(v) == len(batch) + v_out = [] + vs_to_merge = [] + for i in range(len(batch)): + if not needs_merge[i]: + v_out.append(np.concatenate(vs_to_merge)) + vs_to_merge = [] + vs_to_merge.append(v[i]) + + if len(vs_to_merge) > 0: + v_out.append(np.concatenate(vs_to_merge)) + + new_encoding[k] = v_out + elif isinstance(v[0], list): + v_out = [] + vs_to_merge = [] + for i in range(len(batch)): + if not needs_merge[i]: + if len(vs_to_merge) > 0: + v_out.append(list(chain(*vs_to_merge))) + vs_to_merge = [] + vs_to_merge.append(v[i]) + + if len(vs_to_merge) > 0: + v_out.append(list(chain(*vs_to_merge))) + new_encoding[k] = v_out + else: + raise ValueError(f"Unknown type {type(v[0])}") + return 
new_encoding + + # TODO remove this when it's resolved https://github.com/huggingface/tokenizers/issues/1495 + @cached_property + def _needs_long_sequence_workaround(self): + if isinstance(self.tokenizer, PreTrainedTokenizerFast): + normalizer = self.tokenizer.backend_tokenizer.normalizer + if normalizer is None: + return False + # if there's a "Replace" normalizer, then we need to do the workaround + # inexplicably there's no way to see inside a Sequence so we also have to assume it needs it + return isinstance(normalizer, (normalizers.Replace, normalizers.Sequence)) + else: + return False + @property def num_cpus(self) -> int: if self.override_resources is not None: @@ -511,10 +481,10 @@ class LMDatasetSourceConfig: train_urls: List[str] = () # type: ignore validation_urls: List[str] = () # type:ignore - def get_shard_source(self, split) -> Optional[ShardedDataset[str]]: + def get_shard_source(self, split) -> Optional[ShardedDataSource[str]]: if self.id is not None: try: - ds = WrappedHFDataset(self.id, split=split, name=self.name, streaming=self.stream) + ds = WrappedHFDataSource(self.id, split=split, name=self.name, streaming=self.stream) except ValueError as e: # if the message starts with Bad split, then just return None if str(e).startswith("Bad split"): @@ -531,7 +501,7 @@ def get_shard_source(self, split) -> Optional[ShardedDataset[str]]: split_urls = self.urls_for_split(split) if len(split_urls) == 0: return None - return TextUrlDataset(split_urls, self.text_key) + return TextUrlDataSource(split_urls, self.text_key) def doc_iterator(self, split: str): if self.id is not None: @@ -542,7 +512,7 @@ def doc_iterator(self, split: str): else: urls = self.urls_for_split(split) - yield from TextUrlDataset(urls, self.text_key) + yield from TextUrlDataSource(urls, self.text_key) def urls_for_split(self, split): if split == "train": @@ -575,11 +545,13 @@ class LMTaskConfig(abc.ABC): # config related to caching cache_dir: str = "cache/" - rows_per_chunk: int = DEFAULT_ROWS_PER_CHUNK # number of rows to process and cache per chunk + tokenizer_batch_size: int = 32 enforce_eos: bool = True # whether to append eos even if the tokenizer doesn't ignore_token_id: Optional[int] = None - shuffle_buffer_size: Optional[int] = None + shuffle: bool | EraConfig = False + """whether to shuffle the dataset. True means shuffle the whole dataset, False means don't shuffle. 
+ If you want to shuffle in eras, provide an EraConfig (which asks for an era_length)""" @cached_property def the_tokenizer(self) -> PreTrainedTokenizerBase: @@ -591,13 +563,13 @@ def the_tokenizer(self) -> PreTrainedTokenizerBase: @abc.abstractmethod def train_set( self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True, *, key: Optional[PRNGKeyArray] - ) -> ShardableDataset[np.ndarray]: + ) -> AsyncDataset[np.ndarray]: pass @abc.abstractmethod def validation_sets( self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> Mapping[str, ShardableDataset[np.ndarray]]: + ) -> Mapping[str, AsyncDataset[np.ndarray]]: pass @property @@ -607,7 +579,7 @@ def sources(self) -> dict[str, LMDatasetSourceConfig]: def tagged_eval_sets( self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> list[Tuple[ShardableDataset[np.ndarray], List[str]]]: + ) -> list[Tuple[AsyncDataset[np.ndarray], List[str]]]: tags = {name: (config.tags or []) + [name] for name, config in self.sources.items()} eval_sets = self.validation_sets(seq_len, monitors) @@ -620,17 +592,17 @@ class LMDatasetConfig(LMDatasetSourceConfig, LMTaskConfig): def train_set( self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True, *, key: Optional[PRNGKeyArray] = None - ) -> ShardableDataset[np.ndarray]: + ) -> AsyncDataset[np.ndarray]: ds = self.token_seq_dataset("train", seq_len, monitors) if ds is None: raise ValueError("No training set!") - if self.shuffle_buffer_size is not None: - if key is None: - key = jax.random.PRNGKey(0) - return ShuffleDataset(ds, key, self.shuffle_buffer_size) + if self.shuffle is True: + ds = ds.shuffle(key) + elif isinstance(self.shuffle, EraConfig): + ds = ds.era_shuffle(self.shuffle.era_length, key=key) - return ds + return ds # type: ignore def validation_set( self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True @@ -639,7 +611,7 @@ def validation_set( def validation_sets( self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> Mapping[str, ShardableDataset[np.ndarray]]: + ) -> Mapping[str, AsyncDataset[np.ndarray]]: validation_set = self.validation_set(seq_len, monitors) if validation_set is not None: return {"": validation_set} @@ -675,12 +647,12 @@ def token_seq_dataset( def build_or_load_cache( self, split: str, monitors: Union[bool, List[MetricsMonitor]] = True, logger_name: Optional[str] = None - ) -> Optional[TokenizedDocumentCache]: + ) -> Optional[TreeCache[BatchEncoding]]: split_cache_dir = os.path.join(self.cache_dir, split) name = logger_name or os.path.basename(self.cache_dir) try: - return TokenizedDocumentCache.load(split_cache_dir, flatten_docs=True) + return TreeCache.load(split_cache_dir, exemplar={"input_ids": np.zeros(0, dtype=np.int32)}) except FileNotFoundError: pass @@ -699,16 +671,20 @@ def build_or_load_cache( elif monitors is False: monitors = [] - return TokenizedDocumentCache.build_or_load( + bt = BatchTokenizer( + self.the_tokenizer, enforce_bos=True, enforce_eos=self.enforce_eos, batch_size=self.tokenizer_batch_size + ) + + return build_or_load_cache( split_cache_dir, source, - self.the_tokenizer, - enforce_eos=self.enforce_eos, - flatten_docs=True, - rows_per_chunk=self.rows_per_chunk, + bt, + await_finished=False, monitors=monitors, - # TODO: it would be better if we could just prioritize validation higher (we typically want it after the first grad step) - await_finished=(split == "validation"), + cache_config={ + "tokenizer": self.the_tokenizer.name_or_path, + "vocab_size": 
self.the_tokenizer.vocab_size, + }, ) @@ -749,6 +725,8 @@ class LMMixtureDatasetConfig(LMTaskConfig): train_weights: Dict[str, float] = field(default_factory=dict) """ weights for each dataset source. They will be normalized to sum to 1. """ stop_strategy: str = field(default=StopStrategy.RESTART_STRATEGY) + mixture_block_size: int = 2048 + """ block size for the mixture dataset.""" def __post_init__(self): if len(self.configs) == 0: @@ -762,40 +740,59 @@ def __post_init__(self): def train_set( self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True, *, key: Optional[PRNGKeyArray] - ) -> ShardableDataset[np.ndarray]: + ) -> AsyncDataset[np.ndarray]: doc_caches = self.build_caches("train", monitors=monitors) - token_datasets = {name: TokenSeqDataset(cache, seq_len, stride=None) for name, cache in doc_caches.items()} + token_datasets = {name: TokenSeqDataset(cache, seq_len) for name, cache in doc_caches.items()} + if key is None: key = jax.random.PRNGKey(0) mix_key, shuffle_key = jax.random.split(key) + # We shuffle the components and not the overall mixture because this lets us preserve + # the "stable batch" property of the mixture dataset. + def shuffle_ds(ds, key): + if self.shuffle is True: + ds = ds.shuffle(key) + elif isinstance(self.shuffle, EraConfig): + ds = ds.era_shuffle(self.shuffle.era_length, key=key) + + return ds + + if self.shuffle: + out_token_datasets = {} + key_iter = key_iterator(shuffle_key) + for name, ds in token_datasets.items(): + out_token_datasets[name] = shuffle_ds(ds, next(key_iter)) + token_datasets = out_token_datasets + mixture = MixtureDataset( - datasets=token_datasets, weights=self.train_weights, stop_strategy=self.stop_strategy, key=mix_key + datasets=token_datasets, + weights=self.train_weights, + stop_strategy=self.stop_strategy, + key=mix_key, + block_size=2048, ) - if self.shuffle_buffer_size is not None: - return ShuffleDataset(mixture, shuffle_key, self.shuffle_buffer_size) - return mixture def training_sets( self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> Mapping[str, ShardableDataset[np.ndarray]]: + ) -> Mapping[str, TokenSeqDataset]: doc_caches = self.build_caches("train", monitors=monitors) - token_datasets = {name: TokenSeqDataset(cache, seq_len, stride=None) for name, cache in doc_caches.items()} + token_datasets = {name: TokenSeqDataset(cache, seq_len) for name, cache in doc_caches.items()} return token_datasets def validation_sets( self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> Mapping[str, ShardableDataset[np.ndarray]]: + ) -> Mapping[str, AsyncDataset[np.ndarray]]: doc_caches = self.build_caches("validation", monitors=monitors) - token_datasets = {name: TokenSeqDataset(cache, seq_len, stride=None) for name, cache in doc_caches.items()} + token_datasets = {name: TokenSeqDataset(cache, seq_len) for name, cache in doc_caches.items()} return token_datasets def build_caches( self, split: str, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> Dict[str, TokenizedDocumentCache]: + ) -> Dict[str, TreeCache[dict]]: # this is a bit gross, but we want to forward all "Task" config fields to the LMDatasetConfig for building. # We do this by just grabbing all the fields from the LMDatasetConfig and forwarding them to the # LMDatasetConfig.build_or_load_cache method. We exclude the cache_dir field. 
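The mixture weights above are treated as relative weights; a minimal sketch of the normalization they go through (an assumed helper for illustration, not code from this patch):

```python
# Minimal sketch: mixture weights are normalized to sum to 1 before sampling.
def normalize_weights(weights: dict[str, float]) -> dict[str, float]:
    total = sum(weights.values())
    if total <= 0:
        raise ValueError("mixture weights must sum to a positive value")
    return {name: w / total for name, w in weights.items()}

# normalize_weights({"dclm": 3.0, "wikipedia": 1.0}) -> {"dclm": 0.75, "wikipedia": 0.25}
```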
@@ -822,6 +819,13 @@ def build_caches( logger.warning(f"Skipping {name} for split {split} because no source was provided") else: caches[name] = cache + + # in practice it works best if we block on validation caches + if split == "validation": + logger.info("Waiting for validation caches to finish building...") + for cache in caches.values(): + cache.await_finished() + return caches @property diff --git a/src/levanter/doremi.py b/src/levanter/doremi.py index 63495d709..9d048b24f 100644 --- a/src/levanter/doremi.py +++ b/src/levanter/doremi.py @@ -1,6 +1,6 @@ import dataclasses import logging -from typing import Iterator, Optional, Tuple, TypeVar +from typing import Optional, Tuple, TypeVar import equinox as eqx import jax.numpy as jnp @@ -14,7 +14,7 @@ import levanter.tracker from levanter.callbacks import eval_loss_loop from levanter.checkpoint import load_checkpoint_or_initialize -from levanter.data import ShardableDataset +from levanter.data import AsyncDataset, MappedAsyncDataset from levanter.data.mixture import MixtureDataset from levanter.tracker import capture_time from levanter.trainer import M, StepInfo, Trainer, TrainerConfig, TrainerState @@ -56,10 +56,10 @@ def estimate_mixture_weights( loss_fn: ComputeLossFunction[M, T], initial_proxy: M, ref: M, - data_sources: dict[str, ShardableDataset[T]], + data_sources: dict[str, AsyncDataset[T]], sampling_weights: Optional[dict[str, float]] = None, *, - validation_sets: Optional[dict[str, ShardableDataset[T]]] = None, + validation_sets: Optional[dict[str, AsyncDataset[T]]] = None, trainer_config: TrainerConfig = DEFAULT_DOREMI_TRAINER_CONFIG, optimizer: optax.GradientTransformation = optax.adamw(1e-3), domain_weight_step_size: float = 1.0, @@ -107,7 +107,7 @@ def eval_loss(model, *batch, **batch_kwargs): loss = eval_loss_loop( eval_loss, ref, - trainer.replicated_loader(dataset, trainer.EvalBatch), + trainer.data_loader(dataset, trainer.EvalBatch), name=f"ref {domain}", max_batches=trainer_config.max_eval_batches, ) @@ -201,7 +201,7 @@ def doremi_step(state: DoremiState, ref, batch, domains): average_alpha=initial_alpha, ) del initial_proxy - train_loader = iter(trainer.sharded_loader(tagged_mixture, trainer.TrainBatch)) + train_loader = iter(trainer.data_loader(tagged_mixture, trainer.TrainBatch)) if state.step > 0: # step is after the batch, so we need to seek to step @@ -263,7 +263,7 @@ def _prepare_ref_model(ref, trainer): def domain_tagged_mixture( - data_sources: dict[str, ShardableDataset[T]], + data_sources: dict[str, AsyncDataset[T]], weights: dict[str, float], domain_to_index: dict[str, int], *, @@ -278,13 +278,13 @@ def domain_tagged_mixture( for domain, domain_index in domain_to_index.items() } - return MixtureDataset(tagged_datasets, weights, key=key) + return MixtureDataset(tagged_datasets, weights, key=key, block_size=2048) -class DomainTaggedDataset(ShardableDataset[Tuple[T, hax.NamedArray]]): # named array is a scalar int +class DomainTaggedDataset(MappedAsyncDataset[T, Tuple[T, hax.NamedArray]]): # named array is a scalar int def __init__( self, - dataset: ShardableDataset[T], + dataset: AsyncDataset[T], domain_index: int | hax.NamedArray, ): self.dataset = dataset @@ -294,9 +294,7 @@ def __init__( else: self.domain_index = domain_index - def shard(self, shard_id: int, num_shards: int) -> "DomainTaggedDataset[T]": - return DomainTaggedDataset(self.dataset.shard(shard_id, num_shards), self.domain_index) + def _transform(item): + return item, self.domain_index - def __iter__(self) -> Iterator[Tuple[T, hax.NamedArray]]: - 
for item in self.dataset: - yield item, self.domain_index + super().__init__(dataset, _transform) diff --git a/src/levanter/eval.py b/src/levanter/eval.py index 6a016f1f9..48fcb426c 100644 --- a/src/levanter/eval.py +++ b/src/levanter/eval.py @@ -1,7 +1,9 @@ +import asyncio import dataclasses import logging import warnings -from typing import Callable, Optional, Sequence, TypeVar +from collections import defaultdict +from typing import Callable, Mapping, Optional, Sequence, TypeVar import jax.numpy as jnp import jmp @@ -13,7 +15,7 @@ from haliax.partitioning import ResourceMapping import levanter.tracker -from levanter.data import Dataset, ReplicatedBatchLoader +from levanter.data import AsyncDataset, DataLoader from levanter.logging import LoadingTimeTrackerIterator from levanter.models.lm_model import LmExample, LmHeadModel, compute_next_token_loss from levanter.trainer import StepInfo @@ -37,15 +39,20 @@ class EvalResult: total_eval_loading_time: float -class DomainTaggedDataset(Dataset[tuple[T, hax.NamedArray]]): +# This class doesn't try to be async or work with incomplete datasets, because it's eval + + +class DomainTaggedDataset(AsyncDataset[tuple[T, hax.NamedArray]]): """Holds multiple datasets, each with its own domain tag. Also indexes the tags to enable easier aggregation.""" + tag_index: Mapping[str, int] + @property def tags(self): return self.tag_to_index.keys() def __init__( - self, datasets: Sequence[tuple[Dataset[T], Sequence[str]]], max_examples_per_dataset: Optional[int] = None + self, datasets: Sequence[tuple[AsyncDataset[T], Sequence[str]]], max_examples_per_dataset: Optional[int] = None ): self.datasets = [] tag_index: dict[str, int] = {} @@ -62,20 +69,78 @@ def __init__( self.tag_to_index = tag_index self.Tag = hax.Axis("tag", len(self.tag_to_index)) self.max_examples_per_dataset = max_examples_per_dataset + self._tag_arrays = self._compute_tag_arrays() + self._offsets: Optional[np.ndarray] = None + self._max_examples_per_dataset = max_examples_per_dataset + + async def _get_offsets(self) -> np.ndarray: + if self._offsets is None: + lengths = await asyncio.gather(*[dataset.async_len() for dataset, _ in self.datasets]) + if self._max_examples_per_dataset is not None: + lengths = [min(length, self._max_examples_per_dataset) for length in lengths] + self._offsets = np.cumsum([0] + lengths) - def __iter__(self): + return self._offsets # type: ignore + + def _compute_tag_arrays(self): + tag_arrays = [] for dataset, tags in self.datasets: indexed = [self.tag_to_index[tag] for tag in tags] tags = np.zeros(self.Tag.size, dtype=np.int32) tags[indexed] = 1 tags = hax.named(tags, self.Tag) - count = 0 - for example in dataset: - if self.max_examples_per_dataset is not None and count >= self.max_examples_per_dataset: - break - count += 1 - yield example, tags + tag_arrays.append(tags) + return tag_arrays + + async def async_len(self) -> int: + return int((await self._get_offsets())[-1]) + + async def getitem_async(self, item: int) -> tuple[T, hax.NamedArray]: + offsets = await self._get_offsets() + dataset_index = np.searchsorted(offsets, item, side="right") - 1 + offset = offsets[dataset_index] + dataset, tags = self.datasets[dataset_index] + return await dataset.getitem_async(int(item - offset)), self._tag_arrays[dataset_index] + + async def get_batch(self, indices: Sequence[int]) -> Sequence[tuple[T, hax.NamedArray]]: + # Chatgpt wrote this. 
pretty sure it's correct + offsets = await self._get_offsets() + original_order = np.argsort(indices) + sorted_indices = np.array(indices)[original_order] + dataset_indices = np.searchsorted(offsets, sorted_indices, side="right") - 1 + + # Group indices by the dataset they belong to + grouped_indices = defaultdict(list) + for idx, dataset_index in zip(sorted_indices, dataset_indices): + grouped_indices[dataset_index].append(idx - offsets[dataset_index]) + + # Retrieve the batch for each group + batch_futures: list = [] + for dataset_index, dataset_indices in grouped_indices.items(): + dataset, tags = self.datasets[dataset_index] + dataset_batch = dataset.get_batch(dataset_indices) + batch_futures.append(dataset_batch) + + batch_groups = await asyncio.gather(*batch_futures) + batch = [] + for dataset_index, dataset_batch in zip(grouped_indices.keys(), batch_groups): + batch.extend([(item, self._tag_arrays[dataset_index]) for item in dataset_batch]) + + # Reorder the batch to match the original order of indices + batch = [batch[i] for i in np.argsort(original_order)] + + return batch + + async def final_length_is_known(self) -> bool: + return all(await asyncio.gather(*[dataset.final_length_is_known() for dataset, _ in self.datasets])) + + def is_finite(self) -> bool: + return all(dataset.is_finite() for dataset, _ in self.datasets) + + async def current_len(self) -> Optional[int]: + # We currently require all datasets to be finished before we do anything with this dataset, so... + return await self.async_len() def _join_prefix(prefix: str, tag: str) -> str: @@ -86,7 +151,7 @@ def _join_prefix(prefix: str, tag: str) -> str: def cb_tagged_lm_evaluate( EvalBatch: hax.Axis, - tagged_eval_sets: Sequence[tuple[Dataset[LmExample], Sequence[str]]], + tagged_eval_sets: Sequence[tuple[AsyncDataset[LmExample], Sequence[str]]], device_mesh: Optional[Mesh] = None, axis_mapping: ResourceMapping = None, max_examples_per_dataset: Optional[int] = None, @@ -168,7 +233,7 @@ class TaggedEvaluator: def __init__( self, EvalBatch: hax.Axis, - tagged_eval_sets, + tagged_eval_sets: Sequence[tuple[AsyncDataset, Sequence[str]]], device_mesh=None, axis_mapping=None, max_examples_per_dataset=None, @@ -176,8 +241,12 @@ def __init__( ): self.EvalBatch = EvalBatch self.dataset = DomainTaggedDataset(tagged_eval_sets, max_examples_per_dataset) - self.loader = ReplicatedBatchLoader( - self.dataset, mesh=device_mesh, axis_resources=axis_mapping, Batch=EvalBatch + self.loader = DataLoader( + EvalBatch, + self.dataset.as_async_dataset(), + max_buffered_batches=100, + mesh=device_mesh, + axis_resources=axis_mapping, ) self.mp = mp @@ -229,9 +298,11 @@ def evaluate(self, m: LmHeadModel): state = hax.shard(state) iterator = LoadingTimeTrackerIterator(self.loader) + n = 0 for batch, tags in tqdm.tqdm(iterator, "eval"): state = self.accum_for_batch(m, state, batch, tags) + n += 1 total_loss, losses_per_tag = state diff --git a/src/levanter/logging.py b/src/levanter/logging.py index 66e9e9581..1d1e1bd7f 100644 --- a/src/levanter/logging.py +++ b/src/levanter/logging.py @@ -75,11 +75,13 @@ def __init__(self, items: Iterable[T]): start = time.perf_counter() self.items = iter(items) self.total_time += time.perf_counter() - start + self.this_load_time = 0.0 def __next__(self) -> T: start = time.perf_counter() item = next(self.items) - self.total_time += time.perf_counter() - start + self.this_load_time = time.perf_counter() - start + self.total_time += self.this_load_time return item diff --git a/src/levanter/main/cache_dataset.py 
b/src/levanter/main/cache_dataset.py index 5063c69e2..2483e9214 100644 --- a/src/levanter/main/cache_dataset.py +++ b/src/levanter/main/cache_dataset.py @@ -4,10 +4,10 @@ import levanter from levanter.data.metrics_monitor import LoggingMetricsMonitor, RichMetricsMonitor -from levanter.data.shard_cache import build_or_load_cache from levanter.data.text import BatchTokenizer, LMDatasetConfig from levanter.distributed import RayConfig from levanter.logging import init_logging +from levanter.store.cache import build_or_load_cache from levanter.tracker import NoopConfig, TrackerConfig @@ -46,10 +46,8 @@ def main(args: RayCachedLMDatasetConfig): cache_dir=split_cache_dir, input_shards=source, processor=batch_tokenizer, - rows_per_chunk=args.rows_per_chunk, await_finished=False, monitors=monitors, - batch_size=128, ) cache.await_finished() diff --git a/src/levanter/main/eval_lm.py b/src/levanter/main/eval_lm.py index df41750ab..116a08f18 100644 --- a/src/levanter/main/eval_lm.py +++ b/src/levanter/main/eval_lm.py @@ -16,7 +16,7 @@ from levanter import callbacks from levanter.checkpoint import load_checkpoint from levanter.compat.hf_checkpoints import HFCheckpointConverter, RepoRef -from levanter.data import ReplicatedBatchLoader +from levanter.data import DataLoader from levanter.data.text import CausalLmDataset, LMDatasetConfig from levanter.models.gpt2 import Gpt2Config from levanter.models.lm_model import LmConfig, LmExample, LmHeadModel, compute_next_token_loss @@ -57,7 +57,9 @@ def main(config: EvalLmConfig): raw_dataset = CausalLmDataset(validation_set, Pos, KeyPos) # type: ignore - eval_loader = ReplicatedBatchLoader(raw_dataset, config.trainer.device_mesh, Batch) + eval_loader = DataLoader( + Batch, raw_dataset, None, config.trainer.device_mesh, config.trainer.parameter_axis_mapping + ) compute_axis_mapping = config.trainer.compute_axis_mapping parameter_axis_mapping = config.trainer.parameter_axis_mapping diff --git a/src/levanter/main/lora_lm.py b/src/levanter/main/lora_lm.py index 9d7018c7e..9eee109fe 100644 --- a/src/levanter/main/lora_lm.py +++ b/src/levanter/main/lora_lm.py @@ -22,7 +22,6 @@ from levanter.optim import AdamConfig, OptimizerConfig from levanter.trainer import Trainer, TrainerConfig from levanter.utils.jax_utils import parameter_count -from levanter.utils.py_utils import non_caching_cycle logger = logging.getLogger(__name__) @@ -87,7 +86,7 @@ def main(config: LoraLmConfig): logger.warning("No evaluation datasets provided.") train_dataset = CausalLmDataset(config.data.train_set(Pos.size, key=data_key), Pos, KeyPos) - train_loader = trainer.sharded_loader(train_dataset, Batch) + train_loader = trainer.data_loader(train_dataset, Batch) # load the underlying hf model logger.info(f"Loading pretrained model from {converter.reference_checkpoint}") @@ -150,16 +149,7 @@ def loraize_hf_model(model): every=config.hf_save_steps, ) - # data loader. may need to seek to the right place if we're resuming - iter_data = non_caching_cycle(train_loader) - - if int(state.step) > 0: - # step is after the batch, so we need to seek to step - # TODO: implement iter_data.seek(resume_step +1) - import tqdm - - for _ in tqdm.tqdm(range(state.step), desc="seeking data for resume"): - next(iter_data) + iter_data = train_loader.iter_from_step(state.step) ## OK, actually run training! 
trainer.train(state, iter_data) diff --git a/src/levanter/main/train_asr.py b/src/levanter/main/train_asr.py index 2d0651198..72e6d5adb 100644 --- a/src/levanter/main/train_asr.py +++ b/src/levanter/main/train_asr.py @@ -113,7 +113,7 @@ def compute_loss( Pos = config.model.Pos KeyPos = config.model.KeyPos - eval_datasets = config.data.validation_sets(config.batch_size) + eval_datasets = config.data.validation_sets() train_dataset = AudioTextDataset( config.data.train_set(config.batch_size), Pos, @@ -189,16 +189,7 @@ def compute_log_probs(model, example): logprobs = hax.roll(logprobs, 1, Pos) return logprobs.rearrange((EvalBatch, Pos)).array - # data loader. may need to seek to the right place if we're resuming - train_loader = iter(trainer.sharded_loader(train_dataset, Batch)) - - if int(state.step) > 0: - # step is after the batch, so we need to seek to step - # TODO: implement iter_data.seek(resume_step +1) - import tqdm - - for _ in tqdm.tqdm(range(state.step), desc="seeking data for resume"): - next(train_loader) + train_loader = trainer.data_loader(train_dataset, Batch).iter_from_step(state.step) ## OK, actually run training! trainer.train(state, train_loader) diff --git a/src/levanter/main/train_lm.py b/src/levanter/main/train_lm.py index e76f6bc5d..8e905b064 100644 --- a/src/levanter/main/train_lm.py +++ b/src/levanter/main/train_lm.py @@ -114,7 +114,8 @@ def main(config: TrainLmConfig): Pos = config.model.Pos KeyPos = config.model.KeyPos - tagged_eval_datasets = config.data.tagged_eval_sets(Pos.size) + # TODO: fix this + tagged_eval_datasets: list = config.data.tagged_eval_sets(Pos.size) train_dataset = CausalLmDataset( config.data.train_set(Pos.size, key=data_key), Pos, KeyPos, ignore_index=config.data.ignore_token_id ) @@ -161,13 +162,14 @@ def main(config: TrainLmConfig): if len(tagged_eval_datasets) == 0: logger.warning("No evaluation datasets provided.") else: + max_eval_examples_per_ds = config.trainer.max_eval_batches + if max_eval_examples_per_ds is not None: + max_eval_examples_per_ds *= config.trainer.eval_batch_size + causal_datasets = [ (CausalLmDataset(ds, Pos, KeyPos, ignore_index=config.data.ignore_token_id), tags) for ds, tags in tagged_eval_datasets ] - max_eval_examples_per_ds = config.trainer.max_eval_batches - if max_eval_examples_per_ds is not None: - max_eval_examples_per_ds *= config.trainer.eval_batch_size cb = levanter.eval.cb_tagged_lm_evaluate( EvalBatch, @@ -205,23 +207,11 @@ def compute_log_probs(model, example): logprobs = hax.roll(logprobs, 1, Pos) return logprobs.rearrange((EvalBatch, Pos)).array - # engine.add_hook( - # callbacks.compute_and_visualize_log_probs( - # eval_loader, tokenizer, compute_log_probs, os.path.join(config.trainer.run_dir, "log_probs") - # ), - # every=config.trainer.steps_per_eval, - # ) - # - # data loader. may need to seek to the right place if we're resuming - train_loader = iter(trainer.sharded_loader(train_dataset, Batch)) - - if int(state.step) > 0 and seek_dataloader: - # step is after the batch, so we need to seek to step - # TODO: implement iter_data.seek(resume_step +1) - import tqdm - - for _ in tqdm.tqdm(range(state.step), desc="seeking data for resume"): - next(train_loader) + train_loader = trainer.data_loader(train_dataset, Batch) + if seek_dataloader: + train_loader = train_loader.iter_from_step(state.step) + else: + train_loader = iter(train_loader) ## OK, actually run training! 
trainer.train(state, train_loader) diff --git a/src/levanter/main/viz_logprobs.py b/src/levanter/main/viz_logprobs.py index a95783c18..b00ba61d5 100644 --- a/src/levanter/main/viz_logprobs.py +++ b/src/levanter/main/viz_logprobs.py @@ -11,7 +11,7 @@ import levanter from levanter.checkpoint import load_checkpoint -from levanter.data import ReplicatedBatchLoader +from levanter.data import DataLoader from levanter.data.text import CausalLmDataset, LMDatasetConfig from levanter.models.gpt2 import Gpt2Config from levanter.models.lm_model import LmConfig, LmExample, LmHeadModel, compute_next_token_loss @@ -44,10 +44,12 @@ def main(config: VizGpt2Config): Pos = config.model.Pos KeyPos = config.model.KeyPos - eval_loader = ReplicatedBatchLoader( + eval_loader = DataLoader( + EvalBatch, CausalLmDataset(config.data.validation_set(Pos.size), Pos, KeyPos), # type: ignore + 32, config.trainer.device_mesh, - EvalBatch, + config.trainer.compute_axis_mapping, ) # some axes we use outside the model proper diff --git a/src/levanter/store/__init__.py b/src/levanter/store/__init__.py new file mode 100644 index 000000000..d0f4ad96a --- /dev/null +++ b/src/levanter/store/__init__.py @@ -0,0 +1,6 @@ +from .cache import SerialCacheWriter, TreeCache, build_or_load_cache +from .jagged_array import JaggedArrayStore +from .tree_store import TreeStore + + +__all__ = ["TreeCache", "build_or_load_cache", "SerialCacheWriter", "JaggedArrayStore", "TreeStore"] diff --git a/src/levanter/store/cache.py b/src/levanter/store/cache.py new file mode 100644 index 000000000..85b612f91 --- /dev/null +++ b/src/levanter/store/cache.py @@ -0,0 +1,1321 @@ +import asyncio +import concurrent +import dataclasses +import heapq +import logging as pylogging +import os +import threading +import time +from asyncio import InvalidStateError +from concurrent.futures import Future as threading_Future +from contextlib import AbstractContextManager +from dataclasses import dataclass +from typing import Any, Callable, Dict, Generic, Iterator, List, Optional, Sequence, TypeVar, Union + +import fsspec.core +import pyarrow as pa +import ray +from dataclasses_json import dataclass_json +from fsspec import AbstractFileSystem +from ray.actor import ActorHandle + +from levanter.data.dataset import AsyncDataset + +from ..data._preprocessor import BatchProcessor, BatchResult, dict_from_record_batch +from ..data._queue import ( + PriorityWorkItem, + PriorityWorkTaskGroup, + PriorityWorkTaskGroupSpec, + WorkQueueDispatcherActor, + _BatchProcessorQueue, +) +from ..data.metrics_monitor import InProgressCacheMetrics, LoggerMetricsMonitor, MetricsMonitor +from ..data.sharded_datasource import ShardedDataSource +from ..utils.ray_utils import ( + ExceptionInfo, + RefBox, + SnitchRecipient, + current_actor_handle, + log_failures_to, + ser_exc_info, +) +from .tree_store import TreeStore + + +T = TypeVar("T") +U = TypeVar("U") +T_co = TypeVar("T_co", covariant=True) + +logger = pylogging.getLogger(__name__) + +LEDGER_FILE_NAME = "shard_ledger.json" + +DEFAULT_LOG_LEVEL = pylogging.INFO +LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +# TODO: should probably do this in terms of bytes +MIN_ITEMS_TO_WRITE = 8192 +MAX_TIME_BETWEEN_WRITES = 100.0 + + +def build_or_load_cache( + cache_dir: str, + input_shards: ShardedDataSource[T], + processor: BatchProcessor[T, U], + await_finished: bool = True, + monitors: Optional[Sequence["MetricsMonitor"]] = None, + cache_config: Optional[Dict[str, Any]] = None, + items_per_write: int = MIN_ITEMS_TO_WRITE, +) -> 
"TreeCache[U]": + """ + Produces a sharded cache of the dataset using Ray for distributed processing. The cache can be any path + on any file system understood by fsspec. + + This system is designed with tokenization and similar processes in mind, but it can potentially be used for any kind + of preprocessing that converts input batches to output batches. The main design goal is to make it easy to + parallelize preprocessing across multiple machines while maintaining reproducibility and fault tolerance. + Usually the machines in question are the ones doing the training, but they could be separate machines as well. + + See the [Dataloader Design Doc](https://github.com/stanford-crfm/levanter/blob/main/docs/design/Data-Loader-Design.md) + for a somewhat out of date overview of the design. + + Args: + cache_dir: The directory to write the cache to. This can be any path understood by fsspec. + input_shards: A ShardedDataset that will be used to read the input data. Conceptually, it's just a mapping + from shard names to iterators over the data in that shard. + processor: A BatchProcessor that will be used to process batches of data. This is the main place where + you can customize the preprocessing pipeline. + await_finished: If True, this function will block until the cache is finished. If False, it will return + immediately. + monitors: a list of MetricsMonitors to attach to the cache. These will be called periodically with + metrics about the cache build process. If None, will add a LoggerMetricsMonitor. + + cache_config: A dictionary of configuration options for the cache. This is passed to the cache writer. + + items_per_write: The number of items to write to the cache at a time. This is a performance tuning parameter, + and you probably don't need to change it. We mostly use it for testing. + + Returns: + (TreeCache) A TreeCache object that can be used to read the cache. + + """ + # first see if we need to do anything + cache = TreeCache.build_or_load( + cache_dir=cache_dir, + shard_source=input_shards, + processor=processor, + cache_config=cache_config, + items_per_write=items_per_write, + ) + + if cache.is_finished: + logger.info("Cache already finished. Skipping.") + return cache + + if monitors is None: + monitors = [LoggerMetricsMonitor()] + + for monitor in monitors: + cache.attach_metrics_monitor(monitor) + + while await_finished: + try: + cache.await_finished(4.0) + break + except TimeoutError: + pass + + return cache + + +@dataclass_json +@dataclass +class CacheLedger: + # NB: unlike the old cache, the mere existence of a ledger doesn't mean the cache is finished + total_num_rows: int + shard_rows: Dict[str, int] + is_finished: bool = False + finished_shards: List[str] = dataclasses.field(default_factory=list) + field_counts: Dict[str, int] = dataclasses.field(default_factory=dict) + metadata: Dict[str, Any] = dataclasses.field(default_factory=dict) + + +@dataclass +class ShardStatus: + shard_name: str + num_rows_committed: int + is_finished: bool + + +class SerialCacheWriter(AbstractContextManager): + """ + Writes TreeCache-compatible caches to disk. This is a serial version of TreeCacheWriter that doesn't use Ray. + Mostly for scripts and debugging. + + Examples: + >>> with SerialCacheWriter(cache_dir, exemplar) as writer: + ... for batch in process_batches(): + ... 
writer.write_batch(batch) + """ + + def __init__( + self, + cache_dir: str, + exemplar: T, + cache_config: Optional[Dict[str, Any]] = None, + ): + self.cache_dir = cache_dir + self.cache_config = cache_config + self._exemplar = exemplar + self._tree_store = TreeStore.open(exemplar, self.cache_dir, mode="w") # type: ignore + self._is_closed = False + + def __enter__(self) -> "SerialCacheWriter": + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # if successful, write the ledger + # TODO: store field counts in the ledger + ledger = CacheLedger( + total_num_rows=len(self._tree_store), + is_finished=True, + shard_rows={"": len(self._tree_store)}, + finished_shards=[""], + field_counts={}, + ) + + if exc_type is None: + _serialize_json_and_commit(os.path.join(self.cache_dir, LEDGER_FILE_NAME), ledger) + logger.info(f"Cache ledger written to {self.cache_dir}") + self._is_closed = True + + def result(self) -> "TreeCache": + if not self._is_closed: + raise RuntimeError("Cannot get result until TreeCacheWriter is closed") + return TreeCache.load(self.cache_dir, self._exemplar) + + def write_batch(self, batch: BatchResult): + if isinstance(batch, pa.RecordBatch): + raise NotImplementedError("Only non-RecordBatch batches are supported for now") + + batch = _canonicalize_batch(batch) # type: ignore + + self._tree_store.extend(batch) + + +def _load_or_initialize_ledger(path): + try: + with fsspec.open(path, "r") as file: + return CacheLedger.from_json(file.read()) + except FileNotFoundError: + return CacheLedger(0, {}) + + +@ray.remote(num_cpus=0.5) # type: ignore +class _OrderedCacheWriter: + """ + This cache writer receives examples from some number of shards (generally out of order) and writes them to the store + in a defined round-robin order. It also keeps track of the metadata for each shard. + + Once a shard finishes sending batches, it notifies this writer, which then updates the metadata and writes it to disk. 
+ """ + + def __init__( + self, + parent, + name, + exemplar, + batch_size, + cache_dir: str, + shards: Sequence[str], + min_items_to_write=MIN_ITEMS_TO_WRITE, + ): + pylogging.basicConfig(level=DEFAULT_LOG_LEVEL, format=LOG_FORMAT) + with log_failures_to(parent): + self._parent = parent + self.cache_dir = cache_dir + self.shards = shards + self.batch_size = batch_size + self._min_items_to_write = min_items_to_write + self._failed = False + self._logger = pylogging.getLogger(name) + + # these are batches that we've received but haven't ordered them for writing yet + self._batch_queue = GroupRoundRobinBuffer(shards) # type: ignore + self._total_queue_length = 0 + self._was_overwhelmed = False # whether the queue has gotten too big + # writes are very slow (~2s) so we want to batch them up + self._ordered_but_unwritten_items: list = [] + self._batches_in_next_write_by_shard: dict[str, int] = {shard: 0 for shard in shards} + # we also want to write every so often + self._last_write_time = time.time() + + self._ledger = _load_or_initialize_ledger(os.path.join(cache_dir, LEDGER_FILE_NAME)) + self._expected_num_rows: dict[str, Optional[int]] = {shard: None for shard in shards} + + self._tree_store = TreeStore.open(exemplar, self.cache_dir, mode="a") + # careful: trim the store to the total number of rows in the cache that we've committed to + self._tree_store.trim_to_size(self._ledger.total_num_rows) + # we also have to tell the queue how many rows for each shard we've already written + for shard, num_rows in self._ledger.shard_rows.items(): + if num_rows > 0: + self._logger.info(f"Already written {num_rows} rows for shard {shard}") + + # careful: this is in terms of batch size + # Have to round up to the nearest batch size + self._batch_queue.fast_forward(shard, div_round_up(num_rows, self.batch_size)) + if shard in self._ledger.finished_shards: + self._expected_num_rows[shard] = num_rows + self._batch_queue.group_total_known(shard, div_round_up(num_rows, self.batch_size)) + + # double check that we're not finished by committing the ledger + self._attempt_to_write_batches() + + def batch_finished(self, shard_name: str, shard_batch_idx: int, batch_result_box): + with log_failures_to(self._parent): + if self._failed: + self._logger.warning("Received batch after failure. 
Ignoring.") + return + + if isinstance(batch_result_box, RefBox): + batch_result = ray.get(batch_result_box.ref) + else: + batch_result = batch_result_box + + # we need to keep track of the order of the batches so that we can write them out in order + self._total_queue_length += len(batch_result) + self._batch_queue.append_to_group(shard_name, shard_batch_idx, batch_result) + self._attempt_to_write_batches() + next_missing_item = self._batch_queue.next_missing_item_index() + + overwhelmed = self.is_overwhelmed() + if overwhelmed: + if not self._was_overwhelmed: + self._logger.warning(f"Writer queue is getting long ({self._total_queue_length}).") + self._parent.signal_backpressure.remote(next_missing_item) + elif self._was_overwhelmed: + self._logger.info(f"Writer queue is no longer overwhelmed ({self._total_queue_length}).") + self._parent.signal_backpressure.remote(None) + + self._was_overwhelmed = overwhelmed + + def shard_failed(self, shard_name: str, batch_id: int, exc_info: ExceptionInfo): + with log_failures_to(self._parent): + self._failed = True + logger.error(f"Shard {shard_name} failed at batch {batch_id}", exc_info=exc_info.restore()) + self._parent.shard_failed.remote(shard_name, exc_info) + + def shard_finished_reading(self, shard_name: str, expected_num_rows: int): + with log_failures_to(self._parent): + # careful: this is in terms of batch size + self._batch_queue.group_total_known(shard_name, div_round_up(expected_num_rows, self.batch_size)) + self._expected_num_rows[shard_name] = expected_num_rows + logger.debug( + f"Attempting to write batches because {shard_name} finished reading with {expected_num_rows} batches." + ) + self._attempt_to_write_batches() + + def get_shard_status(self, shard_name: str): + with log_failures_to(self._parent): + rows = self._ledger.shard_rows.get(shard_name, 0) + is_finished = shard_name in self._ledger.finished_shards + return ShardStatus(shard_name, rows, is_finished) + + def get_ledger(self): + return self._ledger + + def _attempt_to_write_batches(self): + if self._ledger.is_finished: + raise RuntimeError("Trying to write batches after cache is finished") + + if self._failed: + logger.warning("Not writing batches because of failure.") + return + + self._dequeue_ready_batches() + updated_shards = self._write_available_batches() + + logger.debug(f"Updated shards: {updated_shards}") + + need_to_commit = len(updated_shards) > 0 + total_rows = self._ledger.total_num_rows + sum(updated_shards.values()) + + for shard, num_rows in updated_shards.items(): + self._ledger.shard_rows[shard] = self._ledger.shard_rows.get(shard, 0) + num_rows + + futures_to_await_shards, need_to_commit_for_shards = self._check_for_finished_shards() + + need_to_commit = need_to_commit or need_to_commit_for_shards + + futures_to_await = [] + if need_to_commit: + self._ledger.total_num_rows = total_rows + _serialize_json_and_commit(os.path.join(self.cache_dir, LEDGER_FILE_NAME), self._ledger) + + futures_to_await.append(self._parent._updated_ledger.remote(self._ledger)) + + if self._ledger.is_finished: + f = self._parent._finalize.remote() + futures_to_await.append(f) + + ray.wait(futures_to_await + futures_to_await_shards) + + def _dequeue_ready_batches(self): + for shard, batch in self._batch_queue.drain(): + logger.debug(f"Writing batch for {shard}") + batch = _canonicalize_batch(batch) + self._total_queue_length -= len(batch) + self._ordered_but_unwritten_items.extend(batch) + self._batches_in_next_write_by_shard[shard] = self._batches_in_next_write_by_shard.get(shard, 
0) + len( + batch + ) + + def _write_available_batches(self): + if len(self._ordered_but_unwritten_items) == 0: + return {} + + any_shard_finished_reading = any(num_rows is not None for num_rows in self._expected_num_rows.values()) + + if ( + len(self._ordered_but_unwritten_items) >= self._min_items_to_write + or (time.time() - self._last_write_time > MAX_TIME_BETWEEN_WRITES) + or any_shard_finished_reading + ): + time_in = time.time() + self._tree_store.extend(self._ordered_but_unwritten_items) + time_out = time.time() + logger.debug(f"Wrote {len(self._ordered_but_unwritten_items)} rows in {time_out - time_in:.2f} seconds") + self._ordered_but_unwritten_items = [] + + written_by_shard = self._batches_in_next_write_by_shard + self._batches_in_next_write_by_shard = {} + self._last_write_time = time.time() + return written_by_shard + else: + return {} + + def _check_for_finished_shards(self): + futures_to_await_shards = [] + need_to_commit_for_shards = False + for shard, expected_rows in self._expected_num_rows.items(): + if expected_rows is None: + continue + + current_rows = self._ledger.shard_rows.get(shard, 0) + if current_rows == expected_rows: + if shard not in self._ledger.finished_shards: + logger.info(f"Shard {shard} finished.") + self._ledger.finished_shards.append(shard) + futures_to_await_shards.append(self._parent.shard_finished.remote(shard)) + need_to_commit_for_shards = True + elif current_rows > expected_rows: + raise ValueError(f"Shard {shard} has more rows than expected: {current_rows} > {expected_rows}") + + if len(self._ledger.finished_shards) == len(self.shards) and set(self._ledger.finished_shards) == set( + self.shards + ): + self._ledger.is_finished = True + need_to_commit_for_shards = True + return futures_to_await_shards, need_to_commit_for_shards + + def is_overwhelmed(self) -> bool: + max_queue_size = self._min_items_to_write * 3 + return self._total_queue_length > max_queue_size + + +def _to_list_of_dicts(batch: dict) -> List[dict]: + """ + Convert a batch of dictionaries to a list of dictionaries, suitable for writing to a cache. + """ + keys = list(batch.keys()) + values = list(batch.values()) + num_rows = len(values[0]) + return [{key: values[i][j] for i, key in enumerate(keys)} for j in range(num_rows)] + + +def _canonicalize_batch(batch: Union[dict, List[dict]]) -> List[dict]: + if isinstance(batch, pa.RecordBatch): + batch = dict_from_record_batch(batch) + + if isinstance(batch, dict): + return _to_list_of_dicts(batch) + else: + return batch + + +# thinking through the design of the cache system + +# we decided to use Ray, which was maybe a mistake, but here we are. +# Ray doesn't like it when the number of actors gets too large, so we can't have one actor per shard. +# we have N nodes and K shards. + +# at a high level, we have 3 steps: +# 1. read batches from the shard source +# 2. process batches +# 3. write batches to the cache for that shard + +# The difficulty is that we want parallelism, and we want to control the order of the written data. +# Reading batches requires CPU and network. +# ==> This means we should limit the number of shard groups to roughly the number of nodes, maybe times 2. 
+# We ideally want to read from shards roughly evenly (at least within a group of shards) + + +def _shard_reader_generator(shard_source: ShardedDataSource[T], shard_name: str, start_row: int, batch_size: int): + shard_iter = shard_source.open_shard_at_row(shard_name, start_row) + batch = [] + for row in shard_iter: + batch.append(row) + + if len(batch) == batch_size: + yield batch + batch = [] + + if len(batch) > 0: + yield batch + + +@dataclass +class ShardGroupToBeProcessed(PriorityWorkTaskGroupSpec): + name: str + builder_ref: ray.actor.ActorHandle # _TreeStoreCacheBuilder + writer: ray.actor.ActorHandle # _GroupedShardWriter + shard_source: ShardedDataSource + shard_names: Sequence[str] + priority_fn: Callable[[int, int], float] + processor_actor: ray.actor.ActorHandle # BatchProcessorQueue + batch_size: int + group_id: int + + def build(self) -> "PriorityWorkTaskGroup": + return ShardGroupTaskGroup(self) + + +class ShardGroupTaskGroup(PriorityWorkTaskGroup): + def __init__(self, spec: ShardGroupToBeProcessed): + self.spec: ShardGroupToBeProcessed = spec + self.logger = pylogging.getLogger(f"shard_reader.{spec.group_id}.{spec.name}") + + current_shard_status: dict[str, ShardStatus] = {} + for shard_name in self.spec.shard_names: + try: + current_shard_status[shard_name] = ray.get(self.spec.writer.get_shard_status.remote(shard_name)) + except Exception as e: + self.spec.builder_ref.shard_failed.remote(shard_name, ser_exc_info()) + raise e + + batch_size = self.spec.batch_size + + self._items: list[PriorityWorkItem] = [] + + for shard_name in self.spec.shard_names: + try: + status = current_shard_status[shard_name] + if status.is_finished: + self.logger.info(f"Shard {shard_name} already finished. Skipping.") + continue + + reader = _shard_reader_generator( + self.spec.shard_source, shard_name, status.num_rows_committed, batch_size + ) + + task_name = f"shard_reader.{self.spec.name}.{shard_name}" + + batch_idx = status.num_rows_committed // batch_size + + shard_idx = self.spec.shard_source.shard_names.index(shard_name) + item = ShardReaderItem( + self, + task_name, + shard_name, + shard_idx, + batch_idx=batch_idx, + reader=reader, + current_row=status.num_rows_committed, + ) + + heapq.heappush(self._items, item) + except Exception as e: + self.logger.exception(f"Error while initializing shard {shard_name}") + self.spec.writer[shard_name].shard_failed.remote(ser_exc_info()) + raise e + + @property + def name(self): + return self.spec.name + + def items(self) -> Sequence["PriorityWorkItem"]: + return self._items + + +# NB This class is stateful +@dataclass +class ShardReaderItem(PriorityWorkItem): + """ + Each time execute is called, this class reads a batch of examples from the shard + and dispatches them to the processor. 
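# --- Editorial note (not part of this patch) --------------------------------------------
# The order in which ShardReaderItems run is driven by the `priority` property below,
# i.e. the builder's priority_fn (defined further down as batch_idx * num_shards +
# shard_idx). Items are kept in a heap keyed (presumably) on this priority, so smaller
# values are read first: batch 0 of every shard is read before batch 1 of any shard,
# giving a round-robin read order across shards. A tiny, purely illustrative check with
# a hypothetical helper name:
def _example_priority(shard_idx: int, batch_idx: int, num_shards: int = 3) -> int:
    return batch_idx * num_shards + shard_idx


assert [_example_priority(s, b) for b in range(2) for s in range(3)] == [0, 1, 2, 3, 4, 5]
# -----------------------------------------------------------------------------------------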
+ """ + + group: ShardGroupTaskGroup + name: str + shard_name: str + shard_idx: int + batch_idx: int + reader: Iterator[list] + current_row: int = 0 + + @property + def priority(self): + return self.group.spec.priority_fn(self.shard_idx, self.batch_idx) + + @property + def spec(self): + return self.group.spec + + def execute(self) -> tuple[bool, Optional[ray.ObjectRef]]: + writer = self.spec.writer + write_finished_ref = None + + self.group.logger.debug(f"Reading one batch of shard {self.shard_name}: {self.batch_idx}") + + try: + batch = next(self.reader, None) + exhausted_shard = batch is None or (len(batch) < self.spec.batch_size) + + if batch: + priority = self.spec.priority_fn(self.shard_idx, self.batch_idx) + try: + batch_result_ref = ray.get( + self.spec.processor_actor.submit.remote( + priority=priority, + desc=f"{self.shard_name}.{self.batch_idx}", + batch=RefBox(ray.put(batch)), + ) + ) + logger.debug(f"Got batch result: {batch_result_ref}") + write_finished_ref = writer.batch_finished.remote( + self.shard_name, self.batch_idx, RefBox(batch_result_ref) + ) + self.batch_idx += 1 + self.current_row += len(batch) + except Exception as e: + self.group.logger.exception(f"Error while processing batch {self.batch_idx}") + # fire and forget + writer.shard_failed.remote(self.shard_name, self.batch_idx, ser_exc_info()) + raise e + + if exhausted_shard: + logger.info(f"Shard {self.shard_name} exhausted. Expecting {self.current_row} rows.") + writer.shard_finished_reading.remote(self.shard_name, self.current_row) + + self.group.logger.debug(f"Finished reading one batch of shard {self.shard_name}: {self.batch_idx}") + + return exhausted_shard, write_finished_ref + except Exception as e: # noqa + self.group.logger.exception(f"Error while processing shard {self.shard_name}") + # fire and forget + writer.shard_failed.remote(self.shard_name, self.batch_idx, ser_exc_info()) + raise e + + +def _serialize_json_and_commit(path, obj): + # just to be paranoid, we write to a temp file and then rename it + # TODO: probably we could do better here + with fsspec.open(f"{path}.tmp", "w") as file: + file.write(obj.to_json()) + # now copy the old file to a backup + fs: AbstractFileSystem = fsspec.core.url_to_fs(path)[0] + fs.mkdirs(os.path.dirname(path), exist_ok=True) + if fs.exists(path): + fs.copy(path, f"{path}.bak") + fs.rename(f"{path}.tmp", path) + + +def _load_cache_ledger(cache_dir) -> CacheLedger: + try: + ledger_path = os.path.join(cache_dir, LEDGER_FILE_NAME) + logger.debug(f"Attempting to load cache ledger from {ledger_path}") + with fsspec.open(ledger_path) as file: + cache_ledger = CacheLedger.from_json(file.read()) # type: ignore + return cache_ledger + except FileNotFoundError: + raise FileNotFoundError(f"Cache ledger not found at {ledger_path}") + + +@ray.remote(num_cpus=0.1) # keep this small b/c it doesn't do a lot +class _TreeStoreCacheBuilder(SnitchRecipient): + """ + Actor that coordinates the building of a cache. It spins up a bunch of workers to read from each shard + and write to the cache. 
+ + """ + + def __init__( + self, + cache_dir: str, + name: str, + source: ShardedDataSource[T], + processor: BatchProcessor[T, U], + cache_config: Dict[str, Any], + min_items_to_write: int, + ): + pylogging.basicConfig(level=DEFAULT_LOG_LEVEL, format=LOG_FORMAT) + self.logger = pylogging.getLogger(f"{__name__}.{name}") + self.source = source + self._cache_dir = cache_dir + # self._metrics = InProgressCacheMetrics() + self._updated_ledger_condition = asyncio.Condition() + self._ledger = CacheLedger(0, {}) + self.shards_in_progress: set[str] = set() + exemplar = processor.output_exemplar + + self._finished_promise: asyncio.Future[None] = asyncio.Future() + # used to subscribe to metrics updates + self._cache_config = cache_config + path_for_name = os.path.join(*self._cache_dir.split("/")[-2:]) + name = f"broker::{path_for_name}" + self.logger = pylogging.getLogger(f"{name}") + self._cache_writer: Optional[ActorHandle] = _OrderedCacheWriter.remote( # type: ignore + current_actor_handle(), + f"writer::{path_for_name}", + exemplar, + processor.batch_size, + cache_dir, + source.shard_names, + min_items_to_write, + ) + + try: + cache_ledger = _load_cache_ledger(self._cache_dir) + self._ledger = cache_ledger + except FileNotFoundError: + pass + + if self._ledger.is_finished: + self._finished_promise.set_result(None) + self._start_workers(cache_dir, name, processor, source) + + def _start_workers(self, cache_dir, name, processor, source): + if len(source.shard_names) == 0: + self.logger.warning("No shards to index?!?") + self._finalize() + else: + self.logger.debug(f"Starting cache build for {source.shard_names}") + self.logger.info(f"Starting cache build for {len(source.shard_names)} shards") + + self_ref = current_actor_handle() + + self._shard_writers = [] + self._shard_readers = [] + self._processor_actors = [] + + for shard_name in source.shard_names: + self.shards_in_progress.add(shard_name) + + num_shards = len(source.shard_names) + num_worker_groups = len(ray.nodes()) + num_shard_groups = max(min(num_worker_groups, num_shards), 1) + + # if we have a bunch of caches to build with one shard, we don't want them all + # assigned to the same node, so we use an offset based on the hash of the name (for stability) + # in an attempt to spread them out + group_offset = int(hash(name) % num_worker_groups) + + shard_groups: list[list[str]] = [[] for _ in range(num_shard_groups)] + for i, shard_name in enumerate(source.shard_names): + shard_groups[i % num_shard_groups].append(shard_name) + + def priority_fn(shard_idx, batch_idx): + return batch_idx * num_shards + shard_idx + + for group_id, shard_group in enumerate(shard_groups): + # TODO: would probably be better if we didn't create one of these per shard group + processor_actor = _BatchProcessorQueue.remote(processor) # type: ignore + self._processor_actors.append(processor_actor) + + assert self._cache_writer is not None + + work_item = ShardGroupToBeProcessed( + name=name, + builder_ref=self_ref, + writer=self._cache_writer, + shard_source=source, + shard_names=shard_group, + priority_fn=priority_fn, + processor_actor=processor_actor, + batch_size=processor.batch_size, + group_id=group_id, + ) + + # we want global names so that different tasks can coordinate priorities + worker_to_assign = (group_id + group_offset) % num_worker_groups + priority_actor_name = f"priority_processor.{worker_to_assign}" + + reader_actor = WorkQueueDispatcherActor.options( # type: ignore + name=priority_actor_name, get_if_exists=True + ).remote() + + 
reader_actor.assign_work.remote(work_item) + self._shard_readers.append(reader_actor) + + def shard_finished(self, shard_name: str): + """Callback method for when a shard worker has finished.""" + self.shards_in_progress.remove(shard_name) + + def shard_failed(self, shard_name: str, error: ExceptionInfo): + """Callback method for when a shard worker has failed.""" + self._writer_exception(shard_name, error) + + def _updated_ledger(self, ledger: CacheLedger): + self._ledger = ledger + self._do_notify() + + def other_failed(self, error: ExceptionInfo): + """Callback method for when a shard worker has failed.""" + self._writer_exception(None, error) + + def _child_failed(self, child: ray.actor.ActorHandle, exception: ExceptionInfo): + self.logger.error(f"Child {child} failed with exception", exc_info=exception.restore()) + self._writer_exception(None, exception) + + def is_finished(self): + return self._ledger.is_finished + + async def finished_sentinel(self): + await self._finished_promise + + async def updated_ledger(self) -> CacheLedger: + if self._finished_promise.done(): + if self._finished_promise.exception() is not None: + raise self._finished_promise.exception() # type: ignore + else: + return self._ledger + + async with self._updated_ledger_condition: + await self._updated_ledger_condition.wait() + return self._ledger + + def _writer_exception(self, shard_name, exc_info: ExceptionInfo): + info = exc_info.restore() + + logger.exception(f"Writer task {shard_name} failed with exception", exc_info=info) + + try: + self._finished_promise.set_exception(info[1]) + except InvalidStateError: + pass + except concurrent.futures.InvalidStateError: + pass + self._do_notify() + + def _do_notify(self): + async def _do_notify_async(): + async with self._updated_ledger_condition: + self._updated_ledger_condition.notify_all() + + asyncio.create_task(_do_notify_async()) + + def current_ledger(self): + return self._ledger + + def _finalize(self): + logger.info(f"Finalizing cache {self._cache_dir}...") + + self._ledger.is_finished = True + self._finished_promise.set_result(None) + + # notify metrics subscribers + self._do_notify() + self._cache_writer = None + + def signal_backpressure(self, next_item_desired: Optional[int]): + # get the priority of the item we want + if next_item_desired is not None: + self.logger.debug(f"Signaling backpressure for {next_item_desired}") + # our priority function above is basically (batch_index, shard_index). 
We just ask we don't get more + # than one round of batches ahead + max_priority = (next_item_desired + 1) * len(self.source.shard_names) + + for reader in self._shard_readers: + reader.set_max_dispatch_priority.remote(max_priority) + else: + self.logger.debug("Signaling no backpressure") + for reader in self._shard_readers: + reader.set_max_dispatch_priority.remote(None) + + +def _get_builder_actor(cache_dir, input_shards, processor, cache_config=None, items_per_write=MIN_ITEMS_TO_WRITE): + name = f"lev_cache_manager::{cache_dir}" + path_for_name = os.path.join(*os.path.split(cache_dir)[-2:]) + name_for_display = f"builder::{path_for_name}" + + return _TreeStoreCacheBuilder.options(name=name, get_if_exists=True).remote( # type: ignore + name=name_for_display, + cache_dir=cache_dir, + source=input_shards, + processor=processor, + cache_config=cache_config, + min_items_to_write=items_per_write, + ) + + +class TreeCache(AsyncDataset[T_co]): + ledger: Optional[CacheLedger] + _broker: Optional[ActorHandle] + # monitor_thread waits for new metrics and also periodically reloads the cache + _monitor_thread: Optional[threading.Thread] + _metrics_monitors: List[MetricsMonitor] + + def __init__( + self, + cache_dir: str, + exemplar: T_co, + ledger: Optional[CacheLedger], + _broker, # handle of _TreeStoreCacheBuilder + ): + self.cache_dir = cache_dir + self.ledger = ledger + self._was_already_finished = ledger is not None and ledger.is_finished + self._broker = _broker + self._exemplar = exemplar + + self._metrics_monitors = [] + name = os.path.join(*cache_dir.split("/")[-2:]) + self.logger = pylogging.getLogger(f"TreeCache.{name}") + self._store_future: threading_Future[TreeStore] = threading_Future() + self._stop = False + + if self._broker is not None: + self._monitor_thread = threading.Thread(target=self._monitor_metrics, daemon=True) + self._monitor_thread.start() + else: + self._attempt_to_load_store() + assert self._store_future.done() + + @property + def store(self) -> TreeStore[T_co]: + return self._store_future.result() + + async def store_async(self) -> TreeStore[T_co]: + if self._broker is not None: + return await asyncio.wrap_future(self._store_future) + else: + return self.store + + async def async_len(self) -> int: + if self._broker is not None: + self.await_finished() + + return len(await self.store_async()) + + def __len__(self): + self.await_finished() + + return len(self.store) + + async def final_length_is_known(self) -> bool: + if self._broker is not None: + return await self._broker.is_finished.remote() + + return True + + def is_finite(self) -> bool: + return True + + async def current_len(self) -> int: + if not self._store_future.done(): + return 0 + + return len(await self.store_async()) + + async def get_batch(self, indices: Sequence[int] | slice): + # this is tricky: we want to wait until either the cache is finished or we have the max index + if isinstance(indices, slice): + start, step, stop = await self._get_start_stops_async(indices) + await self._wait_for_len(max(stop, start)) + indices = range(start, stop, step) + + max_index = max(indices) + await self._wait_for_len(max_index + 1) + + return await self.store.get_batch(indices) + + async def _wait_for_len(self, needed_len): + if self._broker is not None: + while needed_len > await self.current_len(): + new_ledger = await self._broker.updated_ledger.remote() + + if needed_len <= new_ledger.total_num_rows: + break + + if new_ledger.is_finished: + if needed_len >= new_ledger.rows_finished: + raise IndexError( + f"Index 
{needed_len} out of bounds for cache of size {new_ledger.total_num_rows}" + ) + break + else: + if needed_len > len(self.store): + raise IndexError(f"Index {needed_len} out of bounds for cache of size {len(self.store)}") + + def _wait_for_len_sync(self, needed_len, timeout: Optional[float] = None): + time_in = time.time() + t_max = time_in + (timeout or 1e6) + if self._broker is not None: + while needed_len > len(self.store): + cur_time = time.time() + if cur_time > t_max: + raise TimeoutError(f"Timed out waiting for cache to reach {needed_len}") + try: + new_ledger = ray.get(self._broker.updated_ledger.remote(), timeout=max(t_max - cur_time, 10)) + except TimeoutError: + continue + + if needed_len <= new_ledger.total_num_rows: + break + + if new_ledger.is_finished: + if needed_len >= new_ledger.rows_finished: + raise IndexError( + f"Index {needed_len} out of bounds for cache of size {new_ledger.total_num_rows}" + ) + break + else: + if needed_len > len(self.store): + raise IndexError(f"Index {needed_len} out of bounds for cache of size {len(self.store)}") + + @staticmethod + def load(cache_dir: str, exemplar: T) -> "TreeCache": + """Loads a cache from disk or an object store. Raises FileNotFoundError if the cache doesn't exist""" + logger.info(f"Loading cache from {cache_dir}") + ledger = _load_cache_ledger(cache_dir) + if not ledger.is_finished: + raise FileNotFoundError(f"Cache at {cache_dir} is not finished. Use build_or_load to build it.") + return TreeCache(cache_dir, exemplar, ledger, None) + + @staticmethod + def build_or_load( + cache_dir: str, + shard_source: ShardedDataSource[T], + processor: BatchProcessor[T, U], + cache_config: Optional[Dict[str, Any]] = None, + items_per_write: int = MIN_ITEMS_TO_WRITE, + ) -> "TreeCache[U]": + try: + return TreeCache.load(cache_dir, processor.output_exemplar) + except FileNotFoundError: + broker = _get_builder_actor( + cache_dir=cache_dir, + input_shards=shard_source, + processor=processor, + cache_config=cache_config, + items_per_write=items_per_write, + ) + return TreeCache(cache_dir=cache_dir, exemplar=processor.output_exemplar, ledger=None, _broker=broker) + + def finished_sentinel(self): + """Returns a Ray-awaitable object that will be set when the cache is finished""" + if self._broker is None: + return ray.remote(num_cpus=0)(lambda: None).remote() + else: + return self._broker.finished_sentinel.remote() + + @property + def is_finished(self): + if self._broker is None: + return True + else: + return ray.get(self._broker.is_finished.remote()) + + def __getitem__(self, item): + if isinstance(item, slice): + start, step, stop = self._get_start_stops(item) + # TODO: wait for store to be set + return self.store[start:stop:step] + else: + if item < 0: + item += len(self) + if item < 0 or item >= len(self): + raise IndexError(f"Index {item} out of bounds for cache of size {len(self)}") + return self.store[item] + + def get_batch_sync(self, indices_or_slice, *, timeout: Optional[float] = None): + store = self.store + if isinstance(indices_or_slice, slice): + start, step, stop = self._get_start_stops(indices_or_slice) + indices_or_slice = range(start, stop, step) + + max_index = max(indices_or_slice) + + self._wait_for_len_sync(max_index + 1, timeout=timeout) + + return store.get_batch_sync(indices_or_slice) + + def _get_start_stops(self, slice): + start = slice.start or 0 + if slice.stop is None: + stop = len(self) + elif slice.stop < 0: + stop = len(self) + slice.stop + else: + stop = slice.stop + if start < 0: + start = len(self) + 
slice.start + step = slice.step or 1 + return start, step, stop + + async def _get_start_stops_async(self, slice): + start = slice.start or 0 + if slice.stop is None: + stop = await self.async_len() + elif slice.stop < 0: + stop = (await self.async_len()) + slice.stop + else: + stop = slice.stop + if start < 0: + start = (await self.async_len()) + slice.start + + step = slice.step or 1 + return start, step, stop + + def await_finished(self, timeout: Optional[float] = None): + x = ray.get(self.finished_sentinel(), timeout=timeout) + self._attempt_to_load_store() + return x + + async def finished(self): + x = await self.finished_sentinel() + # TODO: make an async version of this + self._attempt_to_load_store() + return x + + def _attempt_to_load_store(self): + if self._store_future.done(): + return + + try: + store = TreeStore.open(self._exemplar, self.cache_dir, mode="r") + except FileNotFoundError: + logger.error(f"Cache at {self.cache_dir} not found.") + assert self._broker is not None + ledger = ray.get(self._broker.current_ledger.remote()) + metrics = _ledger_to_metrics(ledger) + if metrics.rows_finished == 0 and metrics.is_finished: + # this means we built an empty cache. go with it + store = TreeStore.open(self._exemplar, f"memory://{self.cache_dir}", mode="a") + else: + raise + try: + self._store_future.set_result(store) + except concurrent.futures.InvalidStateError: + pass + + def attach_metrics_monitor(self, monitor: MetricsMonitor): + if self._broker is None: + logger.warning("Cannot attach metrics monitor to finished cache.") + # TODO: decide what to do about attaching if the cache is already finished + # maybe get the final metrics? + return + + self._metrics_monitors.append(monitor) + + def _monitor_metrics(self): + while not self._stop: + try: + try: + ledger = ray.get(self._broker.updated_ledger.remote(), timeout=10.0) + metrics = _ledger_to_metrics(ledger) + for monitor in self._metrics_monitors: + monitor(metrics) + if metrics.is_finished: + break + except TimeoutError: + pass + except Exception as e: + if str(e).startswith("Failed to submit task to actor"): + logger.warning("Cache builder actor is gone. Stopping monitoring.") + break + try: + self._attempt_to_load_store() + except FileNotFoundError: + pass + except Exception as e: + if str(e).startswith("Failed to submit task to actor"): + logger.warning("Cache builder actor is gone. Stopping monitoring.") + break + else: + self.logger.exception("Error while reading metrics from shard cache.") + raise e + + +def _ledger_to_metrics(ledger: CacheLedger) -> InProgressCacheMetrics: + return InProgressCacheMetrics( + rows_finished=ledger.total_num_rows, + is_finished=ledger.is_finished, + # shard_rows=ledger.shard_rows, + # finished_shards=ledger.finished_shards, + field_counts=ledger.field_counts, + ) + + +class GroupRoundRobinBuffer(Generic[T]): + """ + A buffer that holds items from multiple groups and returns them in a round-robin fashion. + The groups need not have the same number of items. If a group is exhausted, it is removed from the rotation. 
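# --- Editorial sketch (not part of this patch): minimal usage of this class -------------
# Items may arrive out of order within a group; the buffer only releases an item once it
# is that group's turn in the rotation and the next expected serial for the group has
# arrived.
from levanter.store.cache import GroupRoundRobinBuffer  # module added by this patch

buf = GroupRoundRobinBuffer(["shard_a", "shard_b"])
buf.append_to_group("shard_a", 0, "a0")
buf.append_to_group("shard_b", 1, "b1")  # serial 1 arrives before serial 0
buf.append_to_group("shard_b", 0, "b0")
buf.group_total_known("shard_a", 1)  # shard_a has exactly one item in total
buf.group_total_known("shard_b", 2)

# drain() alternates between the remaining groups, dropping shard_a once it is exhausted
assert list(buf.drain()) == [("shard_a", "a0"), ("shard_b", "b0"), ("shard_b", "b1")]
assert buf.is_finished()
# -----------------------------------------------------------------------------------------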
+ """ + + def __init__(self, groups: Sequence[str]): + self.groups = groups + self._current_group = 0 + self.buffers: dict[str, list[tuple[int, T]]] = {group: [] for group in groups} + self._remaining_groups = set(groups) + self._totals_written: dict[str, int] = {group: 0 for group in groups} + self._totals_expected: dict[str, Optional[int]] = {group: None for group in groups} + + def __len__(self): + return sum(len(buffer) for buffer in self.buffers.values()) + + def append_to_group(self, group: str, item_serial: int, item: T): + if group not in self.groups: + raise ValueError(f"Group {group} not in {self.groups}") + + if group not in self._remaining_groups: + raise ValueError(f"Group {group} already finished") + + logger.debug(f"Appending item {item_serial} to {group}") + + heapq.heappush(self.buffers[group], (item_serial, item)) + + def group_total_known(self, group: str, total: int): + if group not in self.groups: + raise ValueError(f"Group {group} not in {self.groups}") + + if group not in self._remaining_groups: + raise ValueError(f"Group {group} already finished: {total} vs {self._totals_expected[group]}") + + self._totals_expected[group] = total + + if self._totals_written[group] == total: + assert len(self.buffers[group]) == 0 + self._remaining_groups.remove(group) + elif self._totals_written[group] > total: + raise ValueError(f"Group {group} has written more than expected: {self._totals_written[group]} > {total}") + + def is_finished(self): + return len(self._remaining_groups) == 0 + + def pop(self) -> Optional[tuple[str, T]]: + group = self._next_group_to_read_from() + if group is None: + return None + + if len(self.buffers[group]) == 0: + return None + + cur_serial, item = self.buffers[group][0] + + # logger.debug( + # f"group: {group}, cur_serial: {cur_serial}, totals_written: {self._totals_written[group]}," + # f" totals_expected: {self._totals_expected.get(group)}" + # ) + + if cur_serial > self._totals_written[group]: + return None + elif cur_serial < self._totals_written[group]: + raise ValueError(f"Duplicate serial {cur_serial} for group {group}") + + heapq.heappop(self.buffers[group]) + logger.debug(f"Read item {cur_serial} from {group}") + + self._totals_written[group] += 1 + + if self._totals_written[group] == self._totals_expected[group]: + assert len(self.buffers[group]) == 0 + assert group in self._remaining_groups + self._remaining_groups.remove(group) + + self._current_group = (self._current_group + 1) % len(self.groups) + + return group, item + + def drain(self) -> Iterator[tuple[str, T]]: + while True: + item = self.pop() + if item is None: + break + yield item + + def _next_group_to_read_from(self): + """ + Returns the next group to read from. This is always the group with the least that is not finished. + """ + if len(self._remaining_groups) == 0: + return None + + # careful: this is only correct if self._current_group is correct. whenever we fast forward, we have to + # recompute it + while True: + group = self.groups[self._current_group] + if group not in self._remaining_groups: + assert self._totals_written[group] == self._totals_expected[group] + assert len(self.buffers[group]) == 0 + self._current_group = (self._current_group + 1) % len(self.groups) + else: + break + return group + + def fast_forward(self, group, num_rows): + """ + Fast forwards the buffer for a group to a certain number of rows. This sets the "next" item to be the + num_rows-th item. 
+ """ + if group not in self.groups: + raise ValueError(f"Group {group} not in {self.groups}") + + if self._totals_written[group] != 0: + raise ValueError(f"Group {group} already written to: {self._totals_written[group]}") + + self._totals_written[group] = num_rows + + self._fix_current_group() + + def _fix_current_group(self): + # This is always the minimum total written group that is not finished + self._current_group = 0 + min_total = None + + for i, group in enumerate(self.groups): + if group not in self._remaining_groups: + continue + total = self._totals_written[group] + if min_total is None or total < min_total: + min_total = total + self._current_group = i + + def next_missing_item_index(self): + """ + Returns the index of the next item that is not in the buffer + (i.e. what's stopping us from yielding the next item). + """ + if len(self._remaining_groups) == 0: + return None + + group = self.groups[self._current_group] + if group not in self._remaining_groups: + self._fix_current_group() + return self.next_missing_item_index() + + if len(self.buffers[group]) == 0: + return self._totals_written[group] + + cur_serial, _ = self.buffers[group][0] + + if cur_serial > self._totals_written[group]: + return self._totals_written[group] + elif cur_serial < self._totals_written[group]: + raise ValueError(f"Duplicate serial {cur_serial} for group {group}") + + return None + + +def div_round_up(x, y): + return (x + y - 1) // y diff --git a/src/levanter/store/jagged_array.py b/src/levanter/store/jagged_array.py new file mode 100644 index 000000000..8b3a26a54 --- /dev/null +++ b/src/levanter/store/jagged_array.py @@ -0,0 +1,508 @@ +import asyncio +import os +from dataclasses import dataclass +from typing import Optional, Sequence + +import fsspec.core +import jax +import jax.experimental.array_serialization.serialization as ser +import jax.numpy as jnp +import numpy as np +import tensorstore as ts + +from levanter.utils import fsspec_utils +from levanter.utils.thread_utils import future_from_value + + +# zarr suggests 1MB chunk size (in bytes, but whatever) +# at 4 bytes this is 256k elements +DEFAULT_CHUNK_SIZE = 256 * 1024 +DEFAULT_WRITE_CHUNK_SIZE = DEFAULT_CHUNK_SIZE * 512 + + +@dataclass +class JaggedArrayStore: + """ + A jagged array is a collection of arrays of varying lengths. + We represent this as a single array with an accompanying array of offsets. + + Note that JAX doesn't really support jagged arrays, so we have to be careful about how we use them. + Typically, we just use these for data loading. + + PERFORMANCE: accessing an individual row (or a single small slice of the underlying data) is very slow. + Where ever possible, use get_batch to get multiple rows at once for as large a batch as possible. + High latency, but high throughput. 
+ """ + + offsets: ts.TensorStore # offsets of the start of each array, except that index[0] is the number of arrays + data: ts.TensorStore + shapes: Optional[ts.TensorStore] # (len(offsets), len(data.shape)-1) + item_rank: int = 1 + + @staticmethod + async def open_async(path: Optional[str], *, mode="a", item_rank=1, dtype) -> "JaggedArrayStore": + offset_path = _extend_path(path, "offsets") + offsets = _ts_open_async(offset_path, jnp.int64, [1], mode=mode) + + data_path = _extend_path(path, "data") + data = _ts_open_async(data_path, dtype, [0], mode=mode) + + if item_rank > 1: + shape_path = _extend_path(path, "shapes") + shapes = _ts_open_async(shape_path, jnp.int64, [0, item_rank - 1], mode=mode) + else: + shapes = None + + return JaggedArrayStore(await offsets, await data, await shapes if shapes is not None else None, item_rank) + + @staticmethod + def open(path: Optional[str], *, mode="a", item_rank=1, dtype) -> "JaggedArrayStore": + offset_path = _extend_path(path, "offsets") + offsets = _ts_open_sync(offset_path, jnp.int64, [1], mode=mode) + + data_path = _extend_path(path, "data") + data = _ts_open_sync(data_path, dtype, [0], mode=mode) + + if item_rank > 1: + shape_path = _extend_path(path, "shapes") + shapes = _ts_open_sync(shape_path, jnp.int64, [0, item_rank - 1], mode=mode) + else: + shapes = None + + return JaggedArrayStore(offsets, data, shapes, item_rank) + + @property + def num_rows(self): + return int(self.offsets[0].read().result()) + + async def num_rows_async(self): + return int(await self.offsets[0].read()) + + @property + def data_size(self): + return int(self.offsets[self.num_rows].read().result()) + + async def append_async(self, data: jax.Array): + await self.extend_async([data]) + + def append(self, data: jax.Array): + self.extend([data]) + + async def trim_to_size_async(self, size: int): + """ + Trims so we have exactly `size` rows in the jagged array. 
+ """ + if size >= len(self): + return + + current_data_size = self.data_size + current_num_rows = await self.num_rows_async() + + offsets_fut = self.offsets[size + 1 : current_num_rows + 1].write(0) + + if size == 0: + new_max = 0 + else: + new_max = int(await self.offsets[size].read()) + + f1 = self.offsets[0].write(size) + + # Trim the shapes + if self.shapes is not None: + shape_fut = self.shapes[size:current_num_rows].write( + np.zeros(self.shapes.shape[1:], dtype=self.shapes.dtype.name) + ) + else: + shape_fut = None + + data_fut = self.data[new_max:current_data_size].write(np.zeros((), dtype=self.data.dtype.name)) + await f1 + + await shape_fut if shape_fut is not None else None + await data_fut + await offsets_fut + + def trim_to_size(self, size: int): + if size >= self.num_rows: + return + + old_len = len(self) + old_data_size = self.data_size + + if self.shapes is not None: + shape_fut = self.shapes[size:old_len].write(np.zeros(self.shapes.shape[1:], dtype=self.shapes.dtype.name)) + else: + shape_fut = None + + f1 = self.offsets[0].write(size) + + if size == 0: + new_max = 0 + else: + new_max = int(self.offsets[size].read().result()) + data_fut = self.data[new_max:old_data_size].write(np.zeros((), dtype=self.data.dtype.name)) + + f1.result() + offsets_fut = self.offsets[size + 1 : old_data_size + 1].write(0) + + data_fut.result() + offsets_fut.result() + + if shape_fut is not None: + shape_fut.result() + + async def extend_async(self, arrays: Sequence[jax.Array]): + data, new_offsets, shapes = self._prepare_batch(arrays) + + num_rows = await self.num_rows_async() + num_added = len(arrays) + current_data_size = self.data_size + + # Write to resized arrays concurrently, adjusting offsets explicitly + write_tasks = [ + self.data[current_data_size : current_data_size + len(data)].write(data), + self.offsets[num_rows + 1 : num_rows + num_added + 1].write(new_offsets), + ] + if self.shapes is not None: + write_tasks.append(self.shapes[num_rows : num_rows + num_added].write(shapes)) + + await asyncio.gather(*write_tasks) + + # Update num_rows + int(self.offsets[self.num_rows].read().result()) + await self.offsets[0].write(num_rows + len(arrays)) + # print("done") + + def extend(self, arrays: Sequence[jax.Array]): + data, new_offsets, shapes = self._prepare_batch(arrays) + + num_rows = self.num_rows + num_added = len(arrays) + current_data_size = self.data_size + + write_tasks = [ + self.data[current_data_size : current_data_size + len(data)].write(data), + self.offsets[num_rows + 1 : num_rows + num_added + 1].write(new_offsets), + ] + if self.shapes is not None: + write_tasks.append(self.shapes[num_rows : num_rows + num_added].write(shapes)) + + for task in write_tasks: + task.result() + + # Update num_rows. 
We want to make sure this comes after the other data is committed to avoid a race + self.offsets[0].write(num_rows + len(arrays)).result() + + def _prepare_batch(self, arrays): + if self.shapes is not None: + for data in arrays: + if data.ndim != self.item_rank: + raise ValueError(f"Expected data to have rank {self.item_rank}, got {data.ndim}") + shapes = np.array([data.shape[:-1] for data in arrays], dtype=np.int64) + else: + for data in arrays: + if data.ndim > 1: + raise ValueError(f"Expected data to have rank 1, got {data.ndim}") + shapes = None + new_offsets = np.array([data.size for data in arrays], dtype=np.int64) + new_offsets = np.cumsum(new_offsets) + self.data_size + data = np.concatenate([data.reshape(-1) for data in arrays]) + return data, new_offsets, shapes + + async def reload_async(self) -> "JaggedArrayStore": + """ + Calls `resolve` on the underlying tensorstore objects, updating size information + + @return: new JaggedArrayStore with resolved tensorstores + """ + offsets = ts.open(_unshaped_spec(self.offsets)) + data = ts.open(_unshaped_spec(self.data)) + shapes = future_from_value(None) if self.shapes is None else ts.open(_unshaped_spec(self.shapes.spec())) + + offsets, data, shapes = await asyncio.gather(offsets, data, shapes) + + return JaggedArrayStore(offsets, data, shapes, self.item_rank) + + def reload(self) -> "JaggedArrayStore": + offsets = ts.open(_unshaped_spec(self.offsets)) + data = ts.open(_unshaped_spec(self.data)) + shapes = None if self.shapes is None else ts.open(_unshaped_spec(self.shapes.spec())).result() + + offsets = offsets.result() + data = data.result() + + return JaggedArrayStore(offsets, data, shapes, self.item_rank) + + def __len__(self): + return self.num_rows + + async def get_item_async(self, item): + if isinstance(item, slice): + raise NotImplementedError("Slicing not supported") + len_self = await self.num_rows_async() + start, stop, step = item.indices(len_self) + if step != 1: + raise ValueError("JaggedArrayStore doesn't support slicing with step != 1") + shapes = None if self.shapes is None else self.shapes[start:stop] + # NB: JaggedArray not JaggedArrayStore + # TODO: use a transformed TS? 
+ data_start, data_stop, offsets = await self._bounds_for_rows_async(start, stop) + new_offsets = offsets - offsets[0] + return JaggedArray(new_offsets, await self.data[data_start:data_stop].read(), shapes) + else: + try: + start, stop, _ = await self._bounds_for_rows_async(item, item + 1) + data = await self.data[start:stop].read() + + if self.shapes is not None: + shapes = np.array(self.shapes[item]) + data = data.reshape(*shapes, -1) + return data + except ValueError as e: + # ts raises a value error for an index out of bounds OUT_OF_RANGE + if "OUT_OF_RANGE" in str(e): + raise IndexError(f"JaggedArrayStore index out of range: {item}") from e + else: + raise e + + async def get_batch(self, indices: Sequence[int]) -> Sequence[jax.Array]: + # get indices + with ts.Batch(): + all_indices_futs = [self._bounds_for_rows_async(indices[i], indices[i] + 1) for i in range(len(indices))] + + # shapes, if applicable + if self.shapes is not None: + with ts.Batch(): + shapes_futs = [self.shapes[i].read() for i in indices] + + all_indices = [(start, stop) for start, stop, _ in await asyncio.gather(*all_indices_futs)] + + # get data + with ts.Batch(): + data_futs = [self.data[start:stop].read() for start, stop in all_indices] + + data = await asyncio.gather(*data_futs) + + if self.shapes is not None: + shapes = await asyncio.gather(*shapes_futs) + + data = [d.reshape(*s, -1) for d, s in zip(data, shapes)] + + return data + + def get_batch_sync(self, indices: Sequence[int]) -> Sequence[jax.Array]: + all_indices = self._bounds_for_rows_batch(indices) + + with ts.Batch(): + # shapes, if applicable + if self.shapes is not None: + shapes_futs = [self.shapes[i].read() for i in indices] + + data_futs = [self.data[start:stop].read() for start, stop in all_indices] + + data = [d.result() for d in data_futs] + + if self.shapes is not None: + shapes = [s.result() for s in shapes_futs] # noqa + data = [d.reshape(*s, -1) for d, s in zip(data, shapes)] + + return data + + def __getitem__(self, item): + if isinstance(item, slice): + # raise NotImplementedError("Slicing not supported") + # # TODO: do we need to avoid reading len(self)? + # start, stop, step = item.indices(len(self)) + # if step != 1: + # raise ValueError("JaggedArrayStore doesn't support slicing with step != 1") + # shapes = None if self.shapes is None else self.shapes[start:stop] + # # NB: JaggedArray not JaggedArrayStore + # # TODO: use a transformed TS? 
+ # data_start, data_stop, offsets = self._bounds_for_rows(start, stop) + # new_offsets = offsets - offsets[0] + # return JaggedArray(new_offsets, self.data[data_start:data_stop].read().result(), shapes) + start, stop, step = item.indices(len(self)) + # for now, just read the data into a list + + return [self[i] for i in range(start, stop, step)] + else: + try: + start, stop, _ = self._bounds_for_rows(item, item + 1) + data = self.data[start:stop].read().result() + + if self.shapes is not None: + shapes = np.array(self.shapes[item]) + data = data.reshape(*shapes, -1) + return data + except ValueError as e: + # ts raises a value error for an index out of bounds OUT_OF_RANGE + if "OUT_OF_RANGE" in str(e): + raise IndexError(f"JaggedArrayStore index out of range: {item}") from e + else: + raise e + + def _bounds_for_rows(self, start, stop): + num_rows = self.num_rows + if start >= num_rows or stop > num_rows: + raise IndexError("Index out of bounds") + start, stop, step = slice(start, stop).indices(num_rows) + offsets = self.offsets[start : stop + 1].read().result() + data_start, data_stop = offsets[0], offsets[-1] + if start == 0: + # The first offset is the number of rows + data_start = 0 + offsets[0] = 0 + + return data_start, data_stop, offsets + + def _bounds_for_rows_batch(self, indices): + num_rows = self.num_rows + offsets_futs: list = [] + + zero_pos = None + + with ts.Batch(): + for index in indices: + if index >= num_rows or index < 0: + raise IndexError("Index out of bounds") + offsets = self.offsets[index : index + 2].read() + offsets_futs.append(offsets) + + if index == 0: + zero_pos = len(offsets_futs) - 1 + + offsets = [fut.result() for fut in offsets_futs] + offsets = [(offset[0], offset[-1]) for offset in offsets] + + if zero_pos is not None: + offsets[zero_pos] = [0, offsets[zero_pos][1]] + + return offsets + + async def _bounds_for_rows_async(self, start, stop): + offsets = await self.offsets[start : stop + 1].read() + data_start, data_stop = offsets[0], offsets[-1] + if start == 0: + # The first offset is the number of rows + data_start = 0 + offsets[0] = 0 + + return data_start, data_stop, offsets + + +def _unshaped_spec(store: ts.TensorStore) -> ts.Spec: + spec = store.spec(retain_context=True) + return spec + + +def _ts_open_sync(path: Optional[str], dtype: jnp.dtype, shape, *, mode): + spec = _get_spec(path, shape) + mode = _mode_to_open_mode(mode) + + # Basically, we want to load the existing shape metadata if it exists + if not mode.get("delete_existing", False): + try: + return ts.open(spec, **mode).result() + except FileNotFoundError: + pass + except ValueError: + pass + + # TODO: groups? + # TODO: set chunk sizes + try: + return ts.open( + spec, + dtype=jnp.dtype(dtype).name, + shape=[2**54, *shape[1:]], + # chunk_layout=ts.ChunkLayout( + # read_chunk_shape=[DEFAULT_CHUNK_SIZE, *shape[1:]], + # write_chunk_shape=[DEFAULT_WRITE_CHUNK_SIZE, *shape[1:]] + # ), + # compression={"codec": "zstd", "compression_level": 5}, + **mode, + ).result() + except ValueError as e: + if "NOT_FOUND" in str(e): + raise FileNotFoundError(f"File not found: {path}") from e + else: + raise e + + +async def _ts_open_async(path: Optional[str], dtype: jnp.dtype, shape, *, mode): + spec = _get_spec(path, shape) + mode = _mode_to_open_mode(mode) + + # Basically, we want to load the existing shape metadata if it exists + if not mode.get("delete_existing", False): + try: + return await ts.open(spec, **mode) + except FileNotFoundError: + pass + except ValueError: + pass + + # TODO: groups? 
+ # TODO: set chunk sizes + return await ts.open( + spec, + dtype=jnp.dtype(dtype).name, + shape=[2**54, *shape[1:]], + # chunk_layout=ts.ChunkLayout( + # read_chunk_shape=[DEFAULT_CHUNK_SIZE, *shape[1:]], + # write_chunk_shape=[DEFAULT_WRITE_CHUNK_SIZE, *shape[1:]] + # ), + # compression={"codec": "zstd", "compression_level": 5}, + **mode, + ) + + +def _get_spec(path, shape): + if path is None: + import uuid + + random_name = str(uuid.uuid4()) + spec = ts.Spec({"driver": "zarr", "kvstore": f"memory://{random_name}"}) + else: + # make path absolute if it's not already + protocol, _ = fsspec.core.split_protocol(path) + if protocol is None: + path = os.path.abspath(path) + spec = ser.get_tensorstore_spec(path, ocdbt=False) + store = spec.get("kvstore") + spec = {"driver": "zarr3", "kvstore": store} + fsspec_utils.mkdirs(os.path.dirname(path)) + spec["metadata"] = { + "chunk_grid": { + "name": "regular", + "configuration": {"chunk_shape": [DEFAULT_WRITE_CHUNK_SIZE, *shape[1:]]}, + }, + "codecs": [ + { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": [DEFAULT_CHUNK_SIZE, *shape[1:]], + "codecs": [{"name": "blosc", "configuration": {"clevel": 5}}], + }, + } + ], + } + return spec + + +def _mode_to_open_mode(mode: str): + if mode == "r": + return {"open_mode": ts.OpenMode(open=True)} + elif mode == "w": + return {"open_mode": ts.OpenMode(create=True, delete_existing=True)} + elif mode == "a": + return {"open_mode": ts.OpenMode(create=True, open=True, delete_existing=False)} + else: + raise ValueError(f"Invalid mode: {mode}") + + +def _extend_path(path: Optional[str], extra: str): + if path == "memory" or path is None: + return path + else: + return os.path.join(path, extra) diff --git a/src/levanter/store/stress_test_new_cache.py b/src/levanter/store/stress_test_new_cache.py new file mode 100644 index 000000000..c583ede56 --- /dev/null +++ b/src/levanter/store/stress_test_new_cache.py @@ -0,0 +1,149 @@ +# Reads an old-style ShardCache and compares to +import asyncio +import logging +import os + +import jax.random +import numpy as np +import tensorstore as ts + +from levanter.data import PermutationDataset +from levanter.data.text import TokenSeqDataset +from levanter.store.cache import LEDGER_FILE_NAME, CacheLedger, TreeCache, _serialize_json_and_commit +from levanter.store.tree_store import TreeStore +from levanter.tracker import capture_time +from levanter.utils import fsspec_utils + + +logging.basicConfig(level=logging.INFO) + + +SEQ_LEN = 1024 +BS = 8 +BATCHES = 1000 + +# want to test reading from: +# 1) old cache sequentially +# 2) new cache sequentially +# 3) new cache randomly + + +def bench_new_cache_serial(exemplar, new_cache_path): + jagged_array = TreeStore.open(exemplar, new_cache_path).tree["input_ids"] + len_cache = jagged_array.data_size + new_cache = jagged_array.data + num_batches = len_cache // SEQ_LEN + for b in range(BATCHES): + elems = [] + with ts.Batch(): + for j in range(BS): + idx = b * BS + j + idx = idx % num_batches + arr1 = new_cache[idx * SEQ_LEN : (idx + 1) * SEQ_LEN].read() + elems.append(arr1) + + for elem in elems: + elem.result() + + +def bench_new_cache_random(exemplar, new_cache_path): + jagged_array = TreeStore.open(exemplar, new_cache_path).tree["input_ids"] + len_cache = jagged_array.data_size + new_cache = jagged_array.data + num_batches = len_cache // SEQ_LEN + for b in range(BATCHES): + elems = [] + with ts.Batch(): + for j in range(BS): + idx = np.random.randint(0, num_batches) + arr1 = new_cache[idx * SEQ_LEN : (idx + 1) * 
SEQ_LEN].read() + elems.append(arr1) + + for elem in elems: + elem.result() + + +async def bench_new_cache_serial_tokenseq(exemplar, new_cache_path): + ensure_cache(new_cache_path) + cache = TreeCache.load(new_cache_path, exemplar) + + ds = TokenSeqDataset(cache, SEQ_LEN) + + num_batches = await ds.async_len() + + for b in range(BATCHES): + indices = [] + for j in range(BS): + idx = b * BS + j + idx = idx % num_batches + indices.append(idx) + elems = await ds.get_batch(indices) + del elems + + +async def bench_new_cache_permutation_random(exemplar, new_cache_path): + ensure_cache(new_cache_path) + cache = TreeCache.load(new_cache_path, exemplar) + + ds = TokenSeqDataset(cache, SEQ_LEN) + ds = PermutationDataset(ds, jax.random.PRNGKey(0)) + + num_batches = await ds.async_len() + + for b in range(BATCHES): + indices = [] + for j in range(BS): + idx = b * BS + j + idx = idx % num_batches + indices.append(idx) + elems = await ds.get_batch(indices) + del elems + + +def ensure_cache(new_cache_path): + if not fsspec_utils.exists(os.path.join(new_cache_path, LEDGER_FILE_NAME)): + ledger = CacheLedger(100000, {}, True) + _serialize_json_and_commit(os.path.join(new_cache_path, LEDGER_FILE_NAME), ledger) + + +if __name__ == "__main__": + import sys + + if not len(sys.argv) == 3: + print("Usage: convert_to_new_cache.py old_cache_path new_cache_path") + sys.exit(1) + + for split in ["validation", "train"]: + print(f"Split: {split}", flush=True) + in_path = os.path.join(sys.argv[1], split) + out_path = os.path.join(sys.argv[2], split) + # convert_to_new_cache(in_path, out_path) + # with capture_time() as time_fn: + # bench_old_cache(in_path) + # tokens_per_second = SEQ_LEN * BS * BATCHES / time_fn() + # print(f"Old Cache: {time_fn()} ({tokens_per_second} tps)", flush=True) + + exemplar = {"input_ids": np.zeros((SEQ_LEN,), dtype=np.int32)} + + with capture_time() as time_fn: + bench_new_cache_serial(exemplar, out_path) + tokens_per_second = SEQ_LEN * BS * BATCHES / time_fn() + print(f"New Cache Serial: {time_fn()} ({tokens_per_second} tps)", flush=True) + + with capture_time() as time_fn: + asyncio.run(bench_new_cache_serial_tokenseq(exemplar, out_path)) + tokens_per_second = SEQ_LEN * BS * BATCHES / time_fn() + + print(f"New Cache Serial TokenSeq: {time_fn()} ({tokens_per_second} tps)", flush=True) + + with capture_time() as time_fn: + bench_new_cache_random(exemplar, out_path) + tokens_per_second = SEQ_LEN * BS * BATCHES / time_fn() + + print(f"New Cache Random: {time_fn()} ({tokens_per_second} tps)", flush=True) + + with capture_time() as time_fn: + asyncio.run(bench_new_cache_permutation_random(exemplar, out_path)) + tokens_per_second = SEQ_LEN * BS * BATCHES / time_fn() + + print(f"New Cache Permutation: {time_fn()} ({tokens_per_second} tps)", flush=True) diff --git a/src/levanter/store/tree_store.py b/src/levanter/store/tree_store.py new file mode 100644 index 000000000..0b1e93bff --- /dev/null +++ b/src/levanter/store/tree_store.py @@ -0,0 +1,237 @@ +import asyncio +import os +from typing import Generic, List, TypeVar + +import jax +import jax.numpy as jnp +import jax.tree_util as jtu +import numpy as np +from jaxtyping import PyTree + +from haliax.jax_utils import is_jax_array_like + +from .jagged_array import JaggedArrayStore + + +T = TypeVar("T", bound=PyTree) + + +# TODO at some point if we turn this into a real library, it would be nice to store the schema +# TODO: some data is probably best not stored as a jagged array, but as a flat array? 
+# TODO: also sometimes we might want a rowstore actually + + +def heuristic_is_leaf(x): + if isinstance(x, list): + return jnp.isscalar(x[0]) + else: + return False + + +def heuristic_is_leaf_batched(x): + if isinstance(x, list): + return jnp.isscalar(x[0]) or is_jax_array_like(x[0]) + else: + return False + + +class TreeStore(Generic[T]): + """ + A TreeStoreBuilder stores batched data as a tree of ragged arrays. + """ + + path: str + mode: str + tree: PyTree[JaggedArrayStore] + + def __init__(self, tree, path: str, mode: str): + self.path = path + self.mode = mode + self.tree = tree + + @staticmethod + def open(exemplar: T, path: str, *, mode="a") -> "TreeStore": + """ + Open a TreeStoreBuilder from a file. + """ + tree = _construct_builder_tree(exemplar, path, mode) + return TreeStore(tree, path, mode) + + def append(self, ex: T): + return self.extend([ex]) + + def extend(self, batch: List[T]): + """ + Append a batch of data to the store. + """ + # TODO: I do wish zarr supported async + jtu.tree_map( + lambda writer, *xs: writer.extend([np.asarray(x) for x in xs]), + self.tree, + *batch, + is_leaf=heuristic_is_leaf, + ) + + def extend_with_batch(self, batch: T): + """ + Append a batch of data (as a pytree with batched leaves) to the store. + + This method works only when the "leaves" are lists of numpy arrays or scalars. + For instance, HF's BatchEncoding is a dict of lists of numpy arrays. + """ + jtu.tree_map( + lambda writer, xs: writer.extend([np.asarray(x) for x in xs]), + self.tree, + batch, + is_leaf=heuristic_is_leaf_batched, + ) + + async def extend_with_batch_async(self, batch: T): + """ + Append a batch of data (as a pytree with batched leaves) to the store. + + This method works only when the "leaves" are lists of numpy arrays or scalars. + For instance, HF's BatchEncoding is a dict of lists of numpy arrays. + """ + futures = jtu.tree_map( + lambda writer, xs: writer.extend_async([np.asarray(x) for x in xs]), + self.tree, + batch, + is_leaf=heuristic_is_leaf_batched, + ) + + await asyncio.gather(*jax.tree_leaves(futures)) + + def trim_to_size(self, size: int): + """ + Trim the store to a given size. + """ + # TODO These all return ts Futures so in theory we could await them all at once + jtu.tree_map(lambda writer: writer.trim_to_size(size), self.tree, is_leaf=heuristic_is_leaf) + + async def trim_to_size_async(self, size: int): + """ + Trim the store to a given size. + """ + futures = jtu.tree_map(lambda writer: writer.trim_to_size_async(size), self.tree, is_leaf=heuristic_is_leaf) + leaves, structure = jax.tree_flatten(futures) + + await asyncio.gather(*leaves) + + def reload(self) -> "TreeStore": + """ + Close the builder and return a TreeStore. 
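+
+        A rough usage sketch: if another process has appended rows to the same path,
+        store = store.reload() returns a view that reflects the updated lengths.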
+ """ + tree = jtu.tree_map(lambda builder: builder.reload(), self.tree, is_leaf=heuristic_is_leaf) + return TreeStore(tree, self.path, self.mode) + + def __len__(self): + if self.tree is None: + return 0 + else: + return len(jax.tree.leaves(self.tree)[0]) + + async def get_batch(self, indices) -> List[T]: + grouped = jtu.tree_map(lambda reader: reader.get_batch(indices), self.tree, is_leaf=heuristic_is_leaf) + + leaves, structure = jtu.tree_flatten(grouped, is_leaf=heuristic_is_leaf) + + awaited_leaves = await asyncio.gather(*leaves) + return [jtu.tree_unflatten(structure, [leaf[i] for leaf in awaited_leaves]) for i in range(len(indices))] + + def __getitem__(self, item): + if self.tree is None: + raise IndexError("No data in store") + elif isinstance(item, slice): + # debatch + leaves, structure = jax.tree.flatten(self.tree, is_leaf=heuristic_is_leaf) + # batched_items = jtu.tree_map(lambda reader: reader[item], self.tree, is_leaf=heuristic_is_leaf) + batched_item_leaves = [leaf[item] for leaf in leaves] + num_items = len(leaves[0]) + return [jtu.tree_unflatten(structure, [leaf[i] for leaf in batched_item_leaves]) for i in range(num_items)] + else: + return jtu.tree_map(lambda reader: reader[item], self.tree, is_leaf=heuristic_is_leaf) + + def __iter__(self): + if self.tree is None: + return + else: + for i in range(len(self)): + yield self[i] + + def get_batch_sync(self, indices) -> List[T]: + # TODO: would be better to batch these up + grouped = jtu.tree_map(lambda reader: reader.get_batch_sync(indices), self.tree, is_leaf=heuristic_is_leaf) + + out = [jtu.tree_map(lambda _, leaf: leaf[i], self.tree, grouped) for i in range(len(indices))] + + return out + + +def _construct_builder_tree(exemplar, path, mode): + def open_builder(tree_path, item): + item = np.asarray(item) + rank = item.ndim + render_tree_path = "/".join(_render_path_elem(x) for x in tree_path) + return JaggedArrayStore.open(os.path.join(path, render_tree_path), mode=mode, item_rank=rank, dtype=item.dtype) + + return jtu.tree_map_with_path(open_builder, exemplar, is_leaf=heuristic_is_leaf) + + +def _render_path_elem(x): + match x: + case jtu.DictKey(key): + return f"{key}" + case jtu.GetAttrKey(key): + return f"{key}" + case jtu.SequenceKey(i): + return f"{i}" + case jtu.FlattenedIndexKey(i): + return f"{i}" + case _: + return str(x) + + +# class TokenSeqDataset: +# """ +# A dataset of sequences of tokens of fixed length, materialized from a collection of JaggedArrayStores, +# which have typically much longer sequences. This class takes consecutive sequences of tokens from the builders +# and slices/concats them to form the dataset. +# """ +# +# def __init__( +# self, token_arrays: Sequence[JaggedArrayStore], token_counts: Sequence[int], seq_len: int, pad_token: int +# ): +# self.token_arrays = token_arrays +# +# def _round_to_nearest_multiple(x, y): +# return x + y - x % y +# +# token_counts_padded = np.array([_round_to_nearest_multiple(x, seq_len) for x in token_counts]) +# seq_counts = token_counts_padded // seq_len +# self.seq_counts_cumsum = np.concatenate([np.asarray([0]), np.cumsum(seq_counts)]) +# +# self.seq_len = seq_len +# self.pad_token = pad_token +# +# def __len__(self): +# return self.seq_counts_cumsum[-1] +# +# def __getitem__(self, seq_id): +# return asyncio.run(self.get_item_async(seq_id)) +# +# async def get_item_async(self, seq_id): +# # TODO: accept slices and such? 
+# shard_id = np.searchsorted(self.seq_counts_cumsum, seq_id, side="right") - 1 +# shard_start = self.seq_counts_cumsum[shard_id] +# shard_end = self.seq_counts_cumsum[shard_id + 1] +# shard_seq_id = seq_id - shard_start +# +# shard_seq_start = shard_seq_id * self.seq_len +# shard_seq_end = min((shard_seq_id + 1) * self.seq_len, self.token_arrays[shard_id].data_size) +# +# shard_seq = await self.token_arrays[shard_id].data[shard_seq_start:shard_seq_end].read() +# pad_len = self.seq_len - (shard_seq_end - shard_seq_start) +# padded_seq = np.concatenate([shard_seq, np.full(pad_len, self.pad_token, dtype=shard_seq.dtype)]) +# +# return padded_seq diff --git a/src/levanter/tracker/wandb.py b/src/levanter/tracker/wandb.py index 1b0254261..1e95c0d3a 100644 --- a/src/levanter/tracker/wandb.py +++ b/src/levanter/tracker/wandb.py @@ -155,7 +155,7 @@ def init(self, run_id: Optional[str]) -> WandbTracker: if jax.process_count() > 1: # we need to share wandb run information across all hosts, because we use it for checkpoint paths and things metadata_to_share = dict( - entity=r.entity, + # entity=r.entity, project=r.project, name=r.name, tags=r.tags, @@ -166,10 +166,10 @@ def init(self, run_id: Optional[str]) -> WandbTracker: metadata_to_share, is_source=jax.process_index() == 0 ) - if jax.process_index() != 0: - assert r.mode == "disabled" - for k, v in metadata_to_share.items(): - setattr(r, k, v) + # if jax.process_index() != 0: + # assert r.mode == "disabled", f"Only the primary worker should be using wandb. Got {r.mode}" + # for k, v in metadata_to_share.items(): + # setattr(r, k, v) logger.info(f"Synced wandb run information from process 0: {r.name} {r.id}") diff --git a/src/levanter/trainer.py b/src/levanter/trainer.py index ef870382b..69c932cd9 100644 --- a/src/levanter/trainer.py +++ b/src/levanter/trainer.py @@ -48,7 +48,7 @@ from levanter import tracker from levanter.checkpoint import CheckpointerConfig, load_checkpoint_or_initialize from levanter.config import JsonAtom -from levanter.data import Dataset, ReplicatedBatchLoader, ShardableDataset, ShardedBatchLoader +from levanter.data import AsyncDataset, DataLoader from levanter.distributed import DistributedConfig, RayConfig from levanter.grad_accum import microbatched from levanter.tracker import TrackerConfig, capture_time @@ -433,7 +433,7 @@ def _add_default_hooks(self): def add_eval_hook(self, eval_dataset, name: Optional[str] = None): from levanter import callbacks - eval_loader = self.replicated_loader(eval_dataset, self.EvalBatch) + eval_loader = self.data_loader(eval_dataset, self.EvalBatch) if eval_loader and (self.config.max_eval_batches is None or self.config.max_eval_batches > 0): @@ -450,31 +450,24 @@ def eval_loss(model, *batch, **batch_kwargs): every=self.config.steps_per_eval, ) - def replicated_loader(self, dataset: Dataset[X], batch_axis: Axis) -> ReplicatedBatchLoader[X]: - """Creates a replicated batch loader for the given dataset. Generally you should use this - if you either be able to make a single pass over the dataset. + def data_loader(self, dataset: AsyncDataset[X], batch_axis: Axis) -> DataLoader[X]: + """Creates a data loader for the given dataset and batch axis. 
Args: - dataset (Dataset): the dataset to load + dataset (AsyncDataset): the dataset to load batch_axis (Axis): the batch axis Returns: - ReplicatedBatchLoader: the batch loader + DataLoader: the data loader """ - return ReplicatedBatchLoader(dataset, self.device_mesh, batch_axis, self.compute_axis_mapping) - - def sharded_loader(self, dataset: ShardableDataset[X], batch_axis: Axis) -> ShardedBatchLoader[X]: - """Creates a sharded batch loader for the given dataset. Generally you should use this - for training and you don't care about epoch boundaries. - - Args: - dataset (Dataset): the dataset to load - batch_axis (Axis): the batch axis - - Returns: - ShardedBatchLoader: the batch loader - """ - return ShardedBatchLoader(dataset, self.device_mesh, batch_axis, self.compute_axis_mapping) + return DataLoader( + batch_axis, + dataset, + max_buffered_batches=128, + mesh=self.device_mesh, + axis_resources=self.compute_axis_mapping, + prefetch_size=32, + ) @cached_property def _jit_train_step_fn(self): diff --git a/src/levanter/utils/background_iterable.py b/src/levanter/utils/background_iterable.py index 6bb200873..84c5a7789 100644 --- a/src/levanter/utils/background_iterable.py +++ b/src/levanter/utils/background_iterable.py @@ -1,7 +1,8 @@ +import asyncio import queue import sys import threading -from typing import Callable, Iterable, Iterator, Optional, TypeVar +from typing import AsyncIterator, Callable, Iterable, Iterator, Optional, TypeVar, Union import tblib @@ -18,27 +19,41 @@ class BackgroundIterable(Iterable[Ex]): like running XLA kernels... """ - def __init__(self, producer_fn: Callable[[], Iterator[Ex]], max_capacity: Optional[int] = None): + def __init__( + self, + producer_fn: Callable[[], Union[Iterator[Ex], AsyncIterator[Ex]]], + max_capacity: Optional[int] = None, + ): self.max_capacity = max_capacity - self._stop_event = threading.Event() self._producer_fn = producer_fn def __iter__(self): - if self._stop_event.is_set(): - raise RuntimeError("Cannot iterate over a stopped BackgroundIterable") + return BackgroundIterator(self._producer_fn, self.max_capacity) + - q = queue.Queue(self.max_capacity) - thread = threading.Thread(target=self._fill_queue_with_batches, args=(q,)) - thread.daemon = True - thread.start() +class BackgroundIterator(Iterator[Ex]): + def __init__(self, producer_fn: Callable[[], Union[Iterator[Ex], AsyncIterator[Ex]]], max_capacity: Optional[int]): + self.max_capacity = max_capacity + self._producer_fn = producer_fn + self._stop_event = threading.Event() + self.q: queue.Queue = queue.Queue(self.max_capacity or 0) + self.thread = threading.Thread(target=self._fill_queue_with_batches) + self.thread.daemon = True + self.thread.start() + + def __iter__(self): + return self + def __next__(self): while not self._stop_event.is_set(): - batch = q.get() + batch = self.q.get() if batch is _SENTINEL: - break + raise StopIteration elif isinstance(batch, _ExceptionWrapper): batch.reraise() - yield batch + return batch + + raise StopIteration def __del__(self): self.stop() @@ -46,13 +61,44 @@ def __del__(self): def stop(self): self._stop_event.set() - def _fill_queue_with_batches(self, q): + def _fill_queue_with_batches(self): + try: + iterator = self._producer_fn() + if isinstance(iterator, Iterator): + self._produce_batches_sync(iterator) + else: + asyncio.run(self._produce_batches_async(iterator)) + except Exception: + self.q.put(_ExceptionWrapper(sys.exc_info())) + + def _produce_batches_sync(self, iterator): + try: + for batch in iterator: + while not 
self._stop_event.is_set(): + try: + self.q.put(batch, block=True, timeout=1) + break + except queue.Full: + pass + + if self._stop_event.is_set(): + break + + while not self._stop_event.is_set(): + try: + self.q.put(_SENTINEL, block=True, timeout=1) + break + except queue.Full: + pass + except Exception: + self.q.put(_ExceptionWrapper(sys.exc_info())) + + async def _produce_batches_async(self, iterator): try: - for batch in self._producer_fn(): - # we don't want to block forever because then we can't stop the thread + async for batch in iterator: while not self._stop_event.is_set(): try: - q.put(batch, block=True, timeout=1) + self.q.put(batch, block=True, timeout=1) break except queue.Full: pass @@ -62,13 +108,12 @@ def _fill_queue_with_batches(self, q): while not self._stop_event.is_set(): try: - q.put(_SENTINEL, block=True, timeout=1) + self.q.put(_SENTINEL, block=True, timeout=1) break except queue.Full: - # don't hold up the thread if we can't put the sentinel pass - except Exception: # flake8: noqa - q.put(_ExceptionWrapper(sys.exc_info())) + except Exception: + self.q.put(_ExceptionWrapper(sys.exc_info())) class _Sentinel: diff --git a/src/levanter/utils/fsspec_utils.py b/src/levanter/utils/fsspec_utils.py index c6adeb3e4..896ea8450 100644 --- a/src/levanter/utils/fsspec_utils.py +++ b/src/levanter/utils/fsspec_utils.py @@ -5,3 +5,9 @@ def exists(url, **kwargs) -> bool: """Check if a file exists on a remote filesystem.""" fs, path = fsspec.core.url_to_fs(url, **kwargs) return fs.exists(path) + + +def mkdirs(path): + """Create a directory and any necessary parent directories.""" + fs, path = fsspec.core.url_to_fs(path) + fs.makedirs(path, exist_ok=True) diff --git a/src/levanter/utils/index.py b/src/levanter/utils/index.py new file mode 100644 index 000000000..3e94ab9fb --- /dev/null +++ b/src/levanter/utils/index.py @@ -0,0 +1,46 @@ +from typing import Generic, Iterable, Iterator, TypeVar + + +T = TypeVar("T") + + +class Index(Generic[T]): + """ + Index is a bidirectional mapping from (incremental) integers to objects. + + Needs to be fast, so it exposes the underlying data structures. 
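+
+    A rough usage sketch:
+
+        index = Index(["a", "b"])
+        index.append("c")        # -> 2
+        index.get_index("b")     # -> 1
+        index.get_obj(2)         # -> "c"
+        "a" in index             # -> True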
+ """ + + def __init__(self, objs: Iterable[T] = ()): + self._index_to_obj: list[T] = [] + self._obj_to_index: dict[T, int] = {} + for obj in objs: + self.append(obj) + + def __len__(self): + return len(self._index_to_obj) + + def __getitem__(self, index: int) -> T: + return self._index_to_obj[index] + + def __setitem__(self, index: int, obj: T): + self._index_to_obj[index] = obj + self._obj_to_index[obj] = index + + def append(self, obj: T) -> int: + index = len(self) + self._index_to_obj.append(obj) + self._obj_to_index[obj] = index + return index + + def get_index(self, obj: T) -> int: + return self._obj_to_index[obj] + + def get_obj(self, index: int) -> T: + return self._index_to_obj[index] + + def __contains__(self, obj: T) -> bool: + return obj in self._obj_to_index + + def __iter__(self) -> Iterator[T]: + return iter(self._index_to_obj) diff --git a/src/levanter/utils/jax_utils.py b/src/levanter/utils/jax_utils.py index d159d7948..1d7205365 100644 --- a/src/levanter/utils/jax_utils.py +++ b/src/levanter/utils/jax_utils.py @@ -41,7 +41,9 @@ def use_cpu_device(): def local_cpu_mesh(): """Temporarily sets the default device to CPU""" cpu = jax.local_devices(backend="cpu")[0] - mesh = jax.sharding.Mesh(np.array([cpu]).reshape(1, 1), ("data", "model")) + mesh = jax.sharding.Mesh( + np.array([cpu]).reshape(1, 1, 1), (ResourceAxis.REPLICA, ResourceAxis.DATA, ResourceAxis.MODEL) + ) with use_cpu_device(), mesh: yield mesh diff --git a/src/levanter/utils/py_utils.py b/src/levanter/utils/py_utils.py index 5262aa75d..a796dd6af 100644 --- a/src/levanter/utils/py_utils.py +++ b/src/levanter/utils/py_utils.py @@ -1,4 +1,3 @@ -import asyncio import os import sys from dataclasses import dataclass @@ -182,9 +181,3 @@ def actual_sizeof(obj): need_to_see.extend(obj) objects = need_to_see return size - - -def future_from_value(value): - future = asyncio.Future() - future.set_result(value) - return future diff --git a/src/levanter/utils/ray_utils.py b/src/levanter/utils/ray_utils.py index 255968815..8a299720e 100644 --- a/src/levanter/utils/ray_utils.py +++ b/src/levanter/utils/ray_utils.py @@ -85,9 +85,11 @@ def _child_failed(self, child: ray.actor.ActorHandle, exception: ExceptionInfo): @contextlib.contextmanager -def log_failures_to(parent): +def log_failures_to(parent, suppress=False): # parent is actorref of SnitchRecipient try: yield except Exception as e: parent._child_failed.remote(current_actor_handle(), ser_exc_info(e)) + if not suppress: + raise e diff --git a/src/levanter/utils/thread_utils.py b/src/levanter/utils/thread_utils.py new file mode 100644 index 000000000..9c6e2ef36 --- /dev/null +++ b/src/levanter/utils/thread_utils.py @@ -0,0 +1,28 @@ +import asyncio +from concurrent.futures import ThreadPoolExecutor + + +# Create a ThreadPoolExecutor +_executor = ThreadPoolExecutor(max_workers=10) + + +def blocking_wait(coro): + """ + This will only work if there are fewer than 10 levels of nested coroutines... 
+ """ + try: + loop = asyncio.get_running_loop() + except RuntimeError: + loop = None + + if loop is not None and loop.is_running(): + future = _executor.submit(lambda: asyncio.run(coro)) + return future.result() + else: + return asyncio.run(coro) + + +def future_from_value(value): + future = asyncio.Future() + future.set_result(value) + return future diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_audio.py b/tests/test_audio.py index c9ae0d494..8d3015431 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -1,7 +1,11 @@ +import tempfile + +import pytest from datasets import load_dataset -from transformers import AutoProcessor +from transformers import AutoProcessor, AutoTokenizer from levanter.data.audio import AudioDatasetSourceConfig, AudioIODatasetConfig, BatchAudioProcessor +from levanter.store.cache import SerialCacheWriter from test_utils import skip_if_hf_model_not_accessible, skip_if_no_soundlibs @@ -9,8 +13,9 @@ @skip_if_hf_model_not_accessible("openai/whisper-tiny") def test_whisper_batch_processor(): processor = AutoProcessor.from_pretrained("openai/whisper-tiny") + tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny") ds = load_dataset("WillHeld/test_librispeech_parquet", split="validation").select_columns(["audio", "text"]) - batch_processor = BatchAudioProcessor(processor) + batch_processor = BatchAudioProcessor(processor, tokenizer) inputs = [(audio["array"], audio["sampling_rate"], text) for audio, text in zip(ds[:16]["audio"], ds[:16]["text"])] batch_processor(inputs) @@ -37,12 +42,41 @@ def test_hf_audio_loading_source(): @skip_if_no_soundlibs @skip_if_hf_model_not_accessible("openai/whisper-tiny") -def test_hf_audio_ray_pipeline(): +@pytest.mark.asyncio +async def test_hf_audio_ray_pipeline(): + # Use the Real Librispeech Valudation. Testing one doesn't support streaming. + with tempfile.TemporaryDirectory() as tmpdir: + ac = AudioIODatasetConfig( + cache_dir=str(tmpdir), id="WillHeld/test_librispeech_parquet", text_key="text", max_length=1024 + ) + validation = ac.validation_set() + for i in range(10): + t = (await validation.get_batch([i]))[0] + assert t["input_features"].shape == (80, 3000), t["input_features"].shape + assert t["input_ids"].shape == (1024,), t["input_ids"].shape + assert t["attention_mask"].shape == (1024,), t["attention_mask"].shape + + +@skip_if_no_soundlibs +@skip_if_hf_model_not_accessible("openai/whisper-tiny") +def test_hf_audio_serial_cache(): # Use the Real Librispeech Valudation. Testing one doesn't support streaming. 
ac = AudioIODatasetConfig(id="WillHeld/test_librispeech_parquet", text_key="text") - audio_iterator = iter(ac.validation_set(batch_size=10)) - for i in range(10): - t = next(audio_iterator) - assert t["input_features"].shape == (80, 3000), t["input_features"].shape - assert t["input_ids"].shape == (1024,), t["input_ids"].shape - assert t["attention_mask"].shape == (1024,), t["attention_mask"].shape + + processor = AutoProcessor.from_pretrained("openai/whisper-tiny") + tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny") + batch_processor = BatchAudioProcessor(processor, tokenizer, max_length=1024) + + with tempfile.TemporaryDirectory() as tmpdir: + with SerialCacheWriter(tmpdir, batch_processor.output_exemplar) as writer: + for i, ex in enumerate(ac.get_shard_source("validation")): + writer.write_batch(batch_processor([ex])) + if i > 10: + break + + cache = writer.result() + + for ex in cache.get_batch_sync(list(range(10))): + assert ex["input_features"].shape == (80, 3000), ex["input_features"].shape + assert ex["input_ids"].shape == (1024,), ex["input_ids"].shape + assert ex["attention_mask"].shape == (1024,), ex["attention_mask"].shape diff --git a/tests/test_background_iterable.py b/tests/test_background_iterable.py index ad768288c..0da8d6ea6 100644 --- a/tests/test_background_iterable.py +++ b/tests/test_background_iterable.py @@ -1,3 +1,5 @@ +import asyncio + import pytest from levanter.utils.background_iterable import BackgroundIterable @@ -55,10 +57,76 @@ def ongoing_process(): for _ in range(5): next(iter1) - background_iterable.stop() + iter1.stop() # Try to get another item from the iterator (should raise StopIteration) # there's a bit of a race so we give it 2 tries, which is enough for the test with pytest.raises(StopIteration): next(iter1) next(iter1) + + +@pytest.mark.asyncio +async def test_async_reentrancy(): + async def async_producer(): + for i in range(1, 101): + yield i + await asyncio.sleep(0.01) + + background_iterable = BackgroundIterable(async_producer, max_capacity=10) + + iter1 = iter(background_iterable) + iter2 = iter(background_iterable) + + data1 = [item for item in iter1] + data2 = [item for item in iter2] + + assert data1 == data2 + assert data1 == list(range(1, 101)) + + +@pytest.mark.asyncio +async def test_async_empty_iteration(): + async def async_producer(): + if False: + yield + + background_iterable = BackgroundIterable(async_producer, max_capacity=10) + + data = list(background_iterable) + + assert data == [] + + +@pytest.mark.asyncio +async def test_async_exception_handling(): + async def async_producer_with_exception(): + raise ValueError("Something went wrong!") + yield 0 # have to make sure it's an async coroutine + + background_iterable = BackgroundIterable(async_producer_with_exception, max_capacity=10) + + with pytest.raises(ValueError): + for _ in background_iterable: + pass + + +@pytest.mark.asyncio +async def test_async_stop_event(): + async def ongoing_async_process(): + while True: + for item in range(1, 101): + yield item + + background_iterable = BackgroundIterable(ongoing_async_process, max_capacity=10) + + iter1 = iter(background_iterable) + + for _ in range(5): + next(iter1) + + iter1.stop() + + with pytest.raises(StopIteration): + await next(iter1) + await next(iter1) diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index 306bec9cd..f5ce0f774 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -1,7 +1,6 @@ import dataclasses import datetime import pathlib -import sys import tempfile 
from datetime import timedelta @@ -281,12 +280,6 @@ def init_fn(key): jax.tree_util.tree_leaves(arrays_only(loaded2)), ) - print(jax.tree_util.tree_leaves(loaded), file=sys.stderr) - print("M1", file=sys.stderr) - print(jax.tree_util.tree_leaves(model1), file=sys.stderr) - print("M0", file=sys.stderr) - print(jax.tree_util.tree_leaves(model0), file=sys.stderr) - assert_trees_all_equal( jax.tree_util.tree_leaves(arrays_only(eqx.filter(loaded, is_checkpointed))), jax.tree_util.tree_leaves(arrays_only(eqx.filter(model0, is_checkpointed))), diff --git a/tests/test_data_mixture.py b/tests/test_data_mixture.py deleted file mode 100644 index 2410a7d5f..000000000 --- a/tests/test_data_mixture.py +++ /dev/null @@ -1,126 +0,0 @@ -import tempfile - -import tiny_test_corpus -from levanter.data import Dataset -from levanter.data.mixture import MixtureDataset, StopStrategy -from levanter.data.text import TokenSeqDataset - - -class ListDataset(Dataset[list]): - def __init__(self, data: list): - self.data = data - - def __iter__(self): - return iter(self.data) - - -def test_stop_strategies(): - seq_len = 10 - - num_docs_1, num_docs_2 = 10, 20 - with tempfile.TemporaryDirectory() as tmpdir: - # source_1 = SingleShardDocumentSource(docs_1) - data_config, _ = tiny_test_corpus.construct_small_data_cache( - f"{tmpdir}/cache_1", num_shards=1, chunk_size=num_docs_1, doc_len=seq_len - ) - - data_config, _ = tiny_test_corpus.construct_small_data_cache( - f"{tmpdir}/cache_2", num_shards=1, chunk_size=num_docs_2, doc_len=seq_len - ) - - ds1 = TokenSeqDataset.load(seq_len, f"{tmpdir}/cache_1/cache/train") - ds2 = TokenSeqDataset.load(seq_len, f"{tmpdir}/cache_2/cache/train") - - # set reuseable config - datasets = {"1": ds1, "2": ds2} - # test mixture with all weights on one dataset - mixture_1_only = MixtureDataset( - datasets=datasets, - weights={"1": 1.0, "2": 0.0}, - stop_strategy=StopStrategy.FIRST_STOP_STRATEGY, - key=0, - ) - counter = 0 - for batch in mixture_1_only: - assert batch.shape == (seq_len,) - counter += 1 - assert counter == 10 - - # compare mixture with different strategies - mixture_balanced_first = MixtureDataset( - datasets=datasets, - weights={"1": 0.5, "2": 0.5}, - stop_strategy=StopStrategy.FIRST_STOP_STRATEGY, - key=0, - ) - counter_first = sum([1 for _ in mixture_balanced_first]) - - mixture_balanced_all = MixtureDataset( - datasets=datasets, - weights={"1": 0.5, "2": 0.5}, - stop_strategy=StopStrategy.ALL_STOP_STRATEGY, - key=0, - ) - counter_all = sum([1 for _ in mixture_balanced_all]) - assert counter_first < counter_all - - # test normalized weights - mixture_normalized = MixtureDataset( - datasets=datasets, - weights={"1": 2.0, "2": 2.0}, - stop_strategy=StopStrategy.FIRST_STOP_STRATEGY, - key=0, - ) - assert mixture_normalized.weights["1"] == mixture_normalized.weights["2"] == 0.5 - - -def test_restart_strategy_gets_the_right_average(): - - num_docs_1, num_docs_2 = 10, 20 - ds1 = ListDataset([1 for _ in range(num_docs_1)]) - ds2 = ListDataset([2 for _ in range(num_docs_2)]) - - datasets = {"1": ds1, "2": ds2} - mixture_balanced_restart = MixtureDataset( - datasets=datasets, # type: ignore - weights={"1": 0.6, "2": 0.4}, - stop_strategy=StopStrategy.RESTART_STRATEGY, - key=0, - ) - - # ensure we get the right long run average - NUM_SAMPLES = 2300 - - # variance of a bernoulli distribution is p(1-p) ≈ 0.24 - # to get a 95% confidence interval of 0.02, we need ~2300 samples - - # we expect to get roughly 60% 1s and 40% 2s - num_ones = 0 - for i, ex in 
enumerate(mixture_balanced_restart): - if ex == 1: - num_ones += 1 - if i >= NUM_SAMPLES: - break - - assert 0.58 < num_ones / NUM_SAMPLES < 0.62 - - # now just to verify, stop_first won't give us the same average - - num_total = 0 - num_ones = 0 - - mixture_balanced_first = MixtureDataset( - datasets=datasets, # type: ignore - weights={"1": 0.6, "2": 0.4}, - stop_strategy=StopStrategy.FIRST_STOP_STRATEGY, - key=0, - ) - - for i, ex in enumerate(mixture_balanced_first): - if ex == 1: - num_ones += 1 - num_total += 1 - - assert num_total < 30 - assert num_ones == num_docs_1 - assert num_ones / num_total < 0.55 or num_ones / num_total > 0.65 diff --git a/tests/test_doremi.py b/tests/test_doremi.py index 8f10139b0..8600c9c8b 100644 --- a/tests/test_doremi.py +++ b/tests/test_doremi.py @@ -1,6 +1,10 @@ +import functools +from typing import Optional, Sequence + import equinox import jax import jax.random +import numpy as np import optax import pytest @@ -8,7 +12,7 @@ import levanter.tracker from levanter.callbacks import eval_loss_loop -from levanter.data.dataset import ShardableDataset +from levanter.data import AsyncDataset from levanter.data.mixture import MixtureDataset from levanter.trainer import Trainer, TrainerConfig from levanter.utils.jax_utils import key_iterator @@ -23,7 +27,7 @@ class Example(equinox.Module): Block = hax.Axis("Block", 1024) -class LogitDataset(ShardableDataset[Example]): +class LogitDataset(AsyncDataset[Example]): def __init__(self, W, noise, x_mask, x_bias, *, key): self.W = W self.noise = noise @@ -31,18 +35,65 @@ def __init__(self, W, noise, x_mask, x_bias, *, key): self.x_bias = x_bias self.key = key + @equinox.filter_jit + def _make_example(x_block, y_block, offset): + return Example(x=x_block[Block, offset], y=y_block[Block, offset]) + + self._make_example = _make_example + + @functools.lru_cache + @equinox.filter_jit + def _gen_block_data(block_id): + key = jax.random.fold_in(self.key, block_id) + x_block = hax.random.normal(key, (Block, self.W.axes[0])) * self.x_mask + self.x_bias + noise = hax.random.normal(key, (Block,)) * self.noise + y_block = (hax.nn.sigmoid(hax.dot(x_block, self.W, axis=self.W.axes[0]) + noise) > 0.5).astype(float) + return x_block, y_block + + self._gen_block_data = _gen_block_data + def __iter__(self): key_iter = key_iterator(self.key) Dim = self.W.axes[0] while True: - x_block = hax.random.normal(next(key_iter), (Block, Dim)) * self.x_mask + self.x_bias - noise = hax.random.normal(next(key_iter), (Block,)) * self.noise + kk = next(key_iter) + this_key_iter = key_iterator(kk) + x_block = hax.random.normal(next(this_key_iter), (Block, Dim)) * self.x_mask + self.x_bias + noise = hax.random.normal(next(this_key_iter), (Block,)) * self.noise y_block = (hax.nn.sigmoid(hax.dot(x_block, self.W, axis=Dim) + noise) > 0.5).astype(float) for i in range(Block.size): - yield Example(x=x_block[Block, i], y=y_block[Block, i]) + yield self._make_example(x_block, y_block, i) + + async def async_len(self) -> int: + raise ValueError("Infinitely long dataset") + + async def final_length_is_known(self) -> bool: + return False + + def is_finite(self) -> bool: + return False - def shard(self, shard_id: int, num_shards: int): - return LogitDataset(self.W, self.noise, self.x_mask, self.x_bias, key=jax.random.fold_in(self.key, shard_id)) + async def current_len(self) -> Optional[int]: + return None + + async def get_batch(self, indices: Sequence[int]) -> Sequence[Example]: + blocks = set(i // Block.size for i in indices) + + block_data = {} + for block_id in 
blocks: + x_block, y_block = self._gen_block_data(block_id) + block_data[block_id] = (x_block, y_block) + + result: list[Example] = [] + indices = np.array(indices, dtype=int) + + for index in indices: + block_id = index // Block.size + block_offset = index % Block.size + x_block, y_block = block_data[block_id] + result.append(self._make_example(x_block, y_block, block_offset)) + + return result @pytest.mark.slow @@ -78,7 +129,7 @@ def compute_loss_fn(model, example, reduction=hax.mean, reduction_axis=None, key return hax.nn.binary_cross_entropy_loss(y_pred, example.y, reduction=reduction, reduction_axis=reduction_axis) tiny_trainer_config = TrainerConfig( - num_train_steps=600, + num_train_steps=300, train_batch_size=Batch.size, tracker=(), id="kmaklfmaf", @@ -89,11 +140,11 @@ def compute_loss_fn(model, example, reduction=hax.mean, reduction_axis=None, key trainer = Trainer(tiny_trainer_config, optimizer, compute_loss_fn) - def fit_to_dataset(dataset): + def fit_to_dataset(dataset: AsyncDataset): initial_model = init_model() with trainer: state = trainer.initial_state(next(keys), model=initial_model) - loader = trainer.replicated_loader(dataset, Batch) + loader = trainer.data_loader(dataset, Batch) loader = non_caching_cycle(loader) loss = 0.0 @@ -125,19 +176,13 @@ def init_model(): datasets = {"d1": ds1, "d2": ds2, "d3": ds3} ref_model, ref_loss = fit_to_dataset( - MixtureDataset(datasets, weights={k: 1 / 3.0 for k in datasets.keys()}, key=next(keys)) + MixtureDataset(datasets, weights={k: 1 / 3.0 for k in datasets.keys()}, key=next(keys), block_size=2048) ) # let's see the loss on each dataset - l1_ref = eval_loss_loop( - compute_loss_fn, ref_model, trainer.replicated_loader(ds1, Batch), max_batches=10, name="d1" - ) - l2_ref = eval_loss_loop( - compute_loss_fn, ref_model, trainer.replicated_loader(ds2, Batch), max_batches=10, name="d2" - ) - l3_ref = eval_loss_loop( - compute_loss_fn, ref_model, trainer.replicated_loader(ds3, Batch), max_batches=10, name="d3" - ) + l1_ref = eval_loss_loop(compute_loss_fn, ref_model, trainer.data_loader(ds1, Batch), max_batches=10, name="d1") + l2_ref = eval_loss_loop(compute_loss_fn, ref_model, trainer.data_loader(ds2, Batch), max_batches=10, name="d2") + l3_ref = eval_loss_loop(compute_loss_fn, ref_model, trainer.data_loader(ds3, Batch), max_batches=10, name="d3") assert l3_ref < l1_ref < l2_ref diff --git a/tests/test_in_progress_sequence.py b/tests/test_in_progress_sequence.py deleted file mode 100644 index 1b5b6711b..000000000 --- a/tests/test_in_progress_sequence.py +++ /dev/null @@ -1,124 +0,0 @@ -import pytest - -from levanter.data._process_interleave import InProgressSequence - - -@pytest.mark.asyncio -async def test_append(): - seq = InProgressSequence[int]() - seq.append(1) - assert seq.current_length() == 1 - assert await seq.get(0) == 1 - - -@pytest.mark.asyncio -async def test_set_item(): - seq = InProgressSequence[int]() - seq.set_item(2, 10) - assert seq.current_length() == 3 - assert await seq.get(2) == 10 - - -@pytest.mark.asyncio -async def test_set_item_out_of_range(): - seq = InProgressSequence[int]() - with pytest.raises(IndexError): - seq.set_item(-1, 10) - - -@pytest.mark.asyncio -async def test_item_exception(): - seq = InProgressSequence[int]() - seq.set_item(0, 5) - seq.item_exception(0, ValueError("Test Exception")) - with pytest.raises(ValueError, match="Test Exception"): - await seq.get(0) - - -@pytest.mark.asyncio -async def test_set_finished_length(): - seq = InProgressSequence[int]() - seq.append(1) - seq.append(2) - 
seq.set_finished_length(2) - assert seq.is_finished() - assert seq.to_list() == [1, 2] - - -@pytest.mark.asyncio -async def test_set_finished_length_first(): - seq = InProgressSequence[int]() - seq.set_finished_length(2) - seq.append(1) - seq.append(2) - assert seq.is_finished() - assert seq.to_list() == [1, 2] - - -@pytest.mark.asyncio -async def test_finalize(): - seq = InProgressSequence[int]() - seq.append(1) - seq.append(2) - seq.finalize() - assert seq.is_finished() - assert seq.to_list() == [1, 2] - - -@pytest.mark.asyncio -async def test_exception_handling(): - seq = InProgressSequence[int]() - seq.set_exception(ValueError("Test Exception")) - with pytest.raises(ValueError, match="Test Exception"): - await seq.finished_promise - - -@pytest.mark.asyncio -async def test_get_promise_immediate(): - seq = InProgressSequence[int]() - seq.append(1) - promise = seq.get_promise(0) - assert await promise == 1 - - -@pytest.mark.asyncio -async def test_get_promise_deferred(): - seq = InProgressSequence[int]() - promise = seq.get_promise(0) - seq.append(2) - assert await promise == 2 - - -@pytest.mark.asyncio -async def test_get_promise_out_of_range(): - seq = InProgressSequence[int]() - seq.set_finished_length(2) - with pytest.raises(IndexError): - seq.get_promise(3) - - -@pytest.mark.asyncio -async def test_get_promise_with_future_exception(): - seq = InProgressSequence[int]() - promise = seq.get_promise(0) - promise2 = seq.get_promise(0) - seq.item_exception(0, ValueError("Test Exception")) - - with pytest.raises(ValueError, match="Test Exception"): - await promise - - with pytest.raises(ValueError, match="Test Exception"): - await promise2 - - -@pytest.mark.asyncio -async def test_get_promise_with_past_exception(): - seq = InProgressSequence[int]() - seq.item_exception(0, ValueError("Test Exception")) - promise = seq.get_promise(0) - promise2 = seq.get_promise(0) - with pytest.raises(ValueError, match="Test Exception"): - await promise - - with pytest.raises(ValueError, match="Test Exception"): - await promise2 diff --git a/tests/test_jagged_array.py b/tests/test_jagged_array.py new file mode 100644 index 000000000..24ed24b08 --- /dev/null +++ b/tests/test_jagged_array.py @@ -0,0 +1,305 @@ +import math +import tempfile + +import jax +import jax.numpy as jnp +import numpy as np +import pytest + +from levanter.store.jagged_array import JaggedArrayStore + + +class TestJaggedArrayStore: + def test_append_and_get(self): + with tempfile.TemporaryDirectory() as tmpdir: + builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32) + + data1 = jnp.array([[1.0, 2.0], [3.0, 4.0]]) + data2 = jnp.array([[5.0]]) + + builder.append(data1) + builder.append(data2) + + assert len(builder) == 2 + + result1 = builder[0] + assert jnp.all(result1 == data1) + + result2 = builder[1] + assert jnp.all(result2 == data2) + + # result_slice = builder[0:2] + # assert isinstance(result_slice, JaggedArray) + + def test_extend_with_multiple(self): + with tempfile.TemporaryDirectory() as tmpdir: + builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32) + + data1 = jnp.array([[1.0, 2.0], [3.0, 4.0]]) + data2 = jnp.array([[5.0]]) + + builder.extend([data1, data2]) + + assert len(builder) == 2 + + result1 = builder[0] + assert jnp.all(result1 == data1) + + result2 = builder[1] + assert jnp.all(result2 == data2) + + def test_append_error(self): + with tempfile.TemporaryDirectory() as tmpdir: + builder = JaggedArrayStore.open(tmpdir, item_rank=1, dtype=jnp.float32) + with pytest.raises(ValueError): + 
builder.append(jnp.array([[1.0, 2.0]])) + + def test_append_single_rank(self): + with tempfile.TemporaryDirectory() as tmpdir: + builder = JaggedArrayStore.open(tmpdir, item_rank=1, dtype=jnp.float32) + + data = jnp.array([1.0, 2.0, 3.0]) + builder.append(data) + + assert len(builder) == 1 + + result = builder[0] + assert jnp.all(result == data) + + def test_append_multi_rank(self): + with tempfile.TemporaryDirectory() as tmpdir: + builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32) + + data1 = jnp.array([[1.0, 2.0], [3.0, 4.0]]) + data2 = jnp.array([[5.0, 6.0], [7.0, 8.0]]) + + builder.append(data1) + builder.append(data2) + + assert len(builder) == 2 + + result1 = builder[0] + assert jnp.all(result1 == data1) + + result2 = builder[1] + assert jnp.all(result2 == data2) + + def test_getitem_out_of_bounds(self): + with tempfile.TemporaryDirectory() as tmpdir: + builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32) + + data = jnp.array([[1.0, 2.0], [3.0, 4.0]]) + builder.append(data) + + with pytest.raises(IndexError): + builder[2] + + def test_step_slicing(self): + with tempfile.TemporaryDirectory() as tmpdir: + builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32) + + data = jnp.array([[1.0, 2.0], [3.0, 4.0]]) + builder.append(data) + + # with pytest.raises(ValueError): + # builder[::2] + + +async def create_builder_with_data(directory, num_sequences: int, sequence_length: int | tuple[int, ...]): + if isinstance(sequence_length, int): + sequence_length = (sequence_length,) + + """Helper function to create a JaggedArrayStore with specific data.""" + seed = jax.random.PRNGKey(num_sequences * math.prod(sequence_length)) + + builder = await JaggedArrayStore.open_async(directory, item_rank=len(sequence_length), dtype=jnp.int64) + for i in range(num_sequences): + key, seed = jax.random.split(seed) + data = jax.random.randint(key, sequence_length, 0, 100) + await builder.append_async(data) + + return builder + + +def create_builder_with_data_sync( + directory, num_sequences: int, sequence_length: int | tuple[int, ...] 
+) -> JaggedArrayStore: + if isinstance(sequence_length, int): + sequence_length = (sequence_length,) + + """Helper function to create a JaggedArrayStore with specific data.""" + seed = jax.random.PRNGKey(num_sequences * math.prod(sequence_length)) + + builder = JaggedArrayStore.open(directory, item_rank=len(sequence_length), dtype=jnp.int64) + for i in range(num_sequences): + key, seed = jax.random.split(seed) + data = jax.random.randint(key, sequence_length, 0, 100) + builder.append(data) + + return builder + + +@pytest.mark.asyncio +async def test_trim_to_size_async(): + tmpdir = tempfile.TemporaryDirectory().name + builder = await create_builder_with_data(tmpdir, num_sequences=10, sequence_length=1000) + + # Initial size + initial_size = len(builder) + assert initial_size == 10 + + expected_data = list([builder[i] for i in range(10)]) + + # Trim to smaller size + await builder.trim_to_size_async(5) + new_size = len(builder) + assert new_size == 5 + + # Verify the data integrity + trimmed_data = await builder.data[0:5000].read() + assert jnp.all(trimmed_data == jnp.concatenate(expected_data[:5])) + + # Trim to zero size + await builder.trim_to_size_async(0) + new_size = len(builder) + assert new_size == 0 + + # Verify the data integrity + trimmed_data = await builder.data[0:5000].read() + assert jnp.all(trimmed_data == 0) + + +@pytest.mark.asyncio +async def test_trim_to_size_larger_than_current(): + tmpdir = tempfile.TemporaryDirectory().name + builder = await create_builder_with_data(tmpdir, num_sequences=10, sequence_length=1000) + expected_data = list([builder[i] for i in range(10)]) + + # Initial size + initial_size = len(builder) + assert initial_size == 10 + + # Trim to a larger size than current (should not change) + await builder.trim_to_size_async(15) + new_size = len(builder) + assert new_size == 10 + + # Verify the data integrity + trimmed_data = await builder.data[0:10000].read() + assert np.array_equal(trimmed_data, jnp.concatenate(expected_data[:10])) + + +@pytest.mark.asyncio +async def test_trim_to_size_with_shapes_async(): + tmpdir = tempfile.TemporaryDirectory().name + builder = await create_builder_with_data(tmpdir, num_sequences=10, sequence_length=(10, 100)) + expected_shapes = list(await builder.shapes[0:10].read()) + + # Trim to smaller size + await builder.trim_to_size_async(5) + new_size = len(builder) + assert new_size == 5 + + # Verify the shapes integrity + trimmed_shapes = await builder.shapes[0:5].read() + assert np.array_equal(trimmed_shapes, jnp.stack(expected_shapes[:5])) + + +def test_trim_to_size(): + tmpdir = tempfile.TemporaryDirectory().name + builder = create_builder_with_data_sync(tmpdir, num_sequences=10, sequence_length=1000) + + # Initial size + initial_size = len(builder) + assert initial_size == 10 + + expected_data = list([builder[i] for i in range(10)]) + + # Trim to smaller size + builder.trim_to_size(5) + new_size = len(builder) + assert new_size == 5 + + # Verify the data integrity + trimmed_data = builder.data[0:5000].read().result() + assert jnp.all(trimmed_data == jnp.concatenate(expected_data[:5])) + + # Trim to zero size + builder.trim_to_size(0) + new_size = len(builder) + assert new_size == 0 + + # Verify the data integrity + trimmed_data = builder.data[0:10000].read().result() + assert jnp.all(trimmed_data == 0) + + +@pytest.mark.asyncio +async def test_get_batch_single_item(): + tmpdir = tempfile.TemporaryDirectory().name + builder = await create_builder_with_data(tmpdir, num_sequences=10, sequence_length=1000) + + # Retrieve a 
single item using get_batch + batch = await builder.get_batch([3]) + result = batch[0] + + expected_data = await builder.get_item_async(3) + + assert np.array_equal(result, expected_data) + + +@pytest.mark.asyncio +async def test_get_batch_multiple_items(): + tmpdir = tempfile.TemporaryDirectory().name + builder = await create_builder_with_data(tmpdir, num_sequences=10, sequence_length=1000) + + # Retrieve multiple items using get_batch + indices = [1, 4, 7] + batch = await builder.get_batch(indices) + + for idx, result in zip(indices, batch): + expected_data = await builder.get_item_async(idx) + assert np.array_equal(result, expected_data) + + +@pytest.mark.asyncio +async def test_get_batch_out_of_order(): + tmpdir = tempfile.TemporaryDirectory().name + builder = await create_builder_with_data(tmpdir, num_sequences=10, sequence_length=1000) + + # Retrieve items out of order using get_batch + indices = [7, 2, 5] + batch = await builder.get_batch(indices) + + for idx, result in zip(indices, batch): + expected_data = await builder.get_item_async(idx) + assert np.array_equal(result, expected_data) + + +@pytest.mark.asyncio +async def test_get_batch_with_shapes(): + tmpdir = tempfile.TemporaryDirectory().name + builder = await create_builder_with_data(tmpdir, num_sequences=10, sequence_length=(10, 100)) + + # Retrieve multiple items using get_batch + indices = [0, 3, 6] + batch = await builder.get_batch(indices) + + for idx, result in zip(indices, batch): + expected_data = await builder.get_item_async(idx) + assert np.array_equal(result, expected_data) + + +@pytest.mark.asyncio +async def test_get_batch_empty(): + tmpdir = tempfile.TemporaryDirectory().name + builder = await create_builder_with_data(tmpdir, num_sequences=10, sequence_length=1000) + + # Retrieve an empty batch + batch = await builder.get_batch([]) + + assert batch == [] + + +if __name__ == "__main__": + pytest.main() diff --git a/tests/test_llama.py b/tests/test_llama.py index 3fc6a551e..4277150fe 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -51,7 +51,6 @@ def test_llama_flops(): llama_config = LlamaConfig.from_hf_config(hf_config) n_params = 6.738415616e9 ratio = llama_config.flops_per_token(hf_config.vocab_size) / (2 * n_params) - print(ratio) assert ratio > 1.1, f"ratio {ratio} < 1.1" assert ratio < 1.2, f"ratio {ratio} > 1.2" @@ -386,6 +385,4 @@ def test_state_dict_consistency(scan_layers, num_kv_heads): model = LlamaLMHeadModel.init(Vocab=Vocab, config=config, key=random.PRNGKey(0)) hf_config = config.to_hf_config(Vocab.size) hf_model = LlamaForCausalLM(hf_config) - print(hf_model.state_dict().keys()) - print(model.to_state_dict().keys()) assert set(hf_model.state_dict().keys()) == set(model.to_state_dict().keys()) diff --git a/tests/test_lora.py b/tests/test_lora.py index f9268d350..f7d852531 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -113,6 +113,7 @@ def test_lora_peft_integration(): hf_dict = get_peft_model_state_dict(model) converter = Gpt2Config().hf_checkpoint_converter() + lev_model = converter.load_pretrained(converter.default_config.model_type, "stanford-crfm/expanse-gpt2-small-x777") lora_lev_model = loraize(lev_model, LoraConfig(r=8, target_modules=["c_attn"]), key=jax.random.PRNGKey(0)) @@ -168,8 +169,8 @@ def replace_dot_general(x): return PreciseDotGeneralOp() return x - merged = jax.tree_map(replace_dot_general, merged, is_leaf=lambda x: isinstance(x, DefaultDotGeneralOp)) - loraized = jax.tree_map(replace_dot_general, loraized, is_leaf=lambda x: isinstance(x, 
DefaultDotGeneralOp)) + merged = jax.tree.map(replace_dot_general, merged, is_leaf=lambda x: isinstance(x, DefaultDotGeneralOp)) + loraized = jax.tree.map(replace_dot_general, loraized, is_leaf=lambda x: isinstance(x, DefaultDotGeneralOp)) input = hax.random.normal(k0, (In,)) # light tolerances for TPU diff --git a/tests/test_mixture.py b/tests/test_mixture.py new file mode 100644 index 000000000..e8821e24f --- /dev/null +++ b/tests/test_mixture.py @@ -0,0 +1,155 @@ +import jax +import numpy as np +import pytest + +from levanter.data import ListAsyncDataset, MixtureDataset +from levanter.data.mixture import StopStrategy + + +def datasets(): + ds1 = ListAsyncDataset([1, 2, 3, 4, 5]) + ds2 = ListAsyncDataset([10, 20, 30, 40, 50]) + ds3 = ListAsyncDataset([100, 200, 300, 400, 500]) + ds1.finalize() + ds2.finalize() + ds3.finalize() + return {"ds1": ds1, "ds2": ds2, "ds3": ds3} + + +def weights(): + return {"ds1": 0.5, "ds2": 0.3, "ds3": 0.2} + + +def block_size(): + return 10 + + +def key(): + return jax.random.PRNGKey(42) + + +@pytest.mark.asyncio +async def test_mixture_dataset_getitem(): + mixture_ds = MixtureDataset(datasets(), weights(), 10, key=key, randomize_blocks=False) + + item = await mixture_ds.getitem_async(0) + assert item in [1, 10, 100], f"Unexpected item: {item}" + + +@pytest.mark.asyncio +async def test_mixture_dataset_get_batch(): + mixture_ds = MixtureDataset(datasets(), weights(), 10, key=key(), randomize_blocks=False) + + batch = await mixture_ds.get_batch([0, 1, 2]) + assert len(batch) == 3 + assert all(item in [1, 2, 3, 10, 20, 30, 100, 200, 300] for item in batch) + + +@pytest.mark.asyncio +async def test_mixture_dataset_block_assignments(): + mixture_ds = MixtureDataset(datasets(), weights(), 10, key=key()) + + block_assignment = await mixture_ds._get_block(0) + assert block_assignment is not None + assert len(block_assignment) == 10 + + +@pytest.mark.skip +@pytest.mark.asyncio +async def test_mixture_dataset_stop_strategy_first(): + mixture_ds = MixtureDataset(datasets(), weights(), 10, key=key, stop_strategy=StopStrategy.FIRST_STOP_STRATEGY) + + with pytest.raises(NotImplementedError): + await mixture_ds.async_len() + + +@pytest.mark.asyncio +async def test_mixture_dataset_stop_strategy_restart(): + mixture_ds = MixtureDataset( + datasets(), weights(), block_size=10, key=key(), stop_strategy=StopStrategy.RESTART_STRATEGY + ) + + with pytest.raises(ValueError): + await mixture_ds.async_len() + + +@pytest.mark.asyncio +async def test_mixture_dataset_normalized_weights(): + weights = {"ds1": 0, "ds2": 0.5, "ds3": 0.5} + mixture_ds = MixtureDataset(datasets(), weights, block_size=10, key=key(), randomize_blocks=False) + + batch = await mixture_ds.get_batch([0, 1, 2]) + assert len(batch) == 3 + assert all(item in [10, 20, 30, 100, 200, 300] for item in batch) + + +@pytest.mark.asyncio +async def test_mixture_dataset_unpermuted_ids(): + mixture_ds = MixtureDataset(datasets(), weights(), block_size=10, key=key()) + + unpermuted_ids = mixture_ds._compute_unpermuted_ids(mixture_ds._counts_per_block) + assert len(unpermuted_ids) == 10 + assert unpermuted_ids[0] >> 32 in range(3) # Ensure the dataset ID is valid + + +@pytest.mark.asyncio +async def test_mixture_dataset_remap_indices(): + dses = datasets() + mixture_ds = MixtureDataset(dses, weights(), block_size=10, key=key()) + + remapped_indices = await mixture_ds._remap_indices(dses["ds1"], [0, 1, 2]) + assert len(remapped_indices) == 3 + assert remapped_indices == [0, 1, 2] + + # check wrap around + len_ds1 = await 
dses["ds1"].async_len() + remapped_indices = await mixture_ds._remap_indices(dses["ds1"], [len_ds1 - 1, len_ds1, len_ds1 + 1]) + assert len(remapped_indices) == 3 + + assert remapped_indices == [len_ds1 - 1, 0, 1] + + +@pytest.mark.asyncio +async def test_mixture_dataset_respects_weights(): + w = weights() + mixture_ds = MixtureDataset(datasets(), w, block_size(), key=key()) + + # Check that the dataset respects the weights + num_samples = 1000 + samples = await mixture_ds.get_batch(list(range(num_samples))) + + counts = {"ds1": 0, "ds2": 0, "ds3": 0} + for sample in samples: + if sample < 10: + counts["ds1"] += 1 + elif sample < 100: + counts["ds2"] += 1 + else: + counts["ds3"] += 1 + + for dataset, count in counts.items(): + assert abs(count / num_samples - w[dataset]) < 0.1, f"Dataset {dataset} has unexpected weight" + + +@pytest.mark.asyncio +async def test_mixture_dataset_randomizes_blocks(): + mixture_ds = MixtureDataset(datasets(), weights(), block_size=10, key=key()) + + block_assignment_1 = await mixture_ds._get_block(0) + block_assignment_2 = await mixture_ds._get_block(0) + + assert np.all(block_assignment_1 == block_assignment_2), "Block assignments should be randomized" + + block_assignment_3 = await mixture_ds._get_block(1) + assert not np.all(block_assignment_1 == block_assignment_3), "Block assignments should be randomized" + + +@pytest.mark.asyncio +async def test_mixture_dataset_samples_all_elements(): + mixture_ds = MixtureDataset(datasets(), weights(), block_size=10, key=key()) + + num_samples = 1000 + samples = await mixture_ds.get_batch(list(range(num_samples))) + + assert len(samples) == num_samples + assert set(samples) == {1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100, 200, 300, 400, 500} diff --git a/tests/test_new_cache.py b/tests/test_new_cache.py new file mode 100644 index 000000000..3302674de --- /dev/null +++ b/tests/test_new_cache.py @@ -0,0 +1,921 @@ +import asyncio +import logging +import tempfile +from typing import Iterator, Sequence +from unittest.mock import MagicMock + +import numpy as np +import pytest +import ray +from ray.exceptions import RayTaskError + +from levanter.data import BatchProcessor, ShardedDataSource, batched +from levanter.data.sharded_datasource import TextUrlDataSource +from levanter.store.cache import ( + SerialCacheWriter, + TreeStore, + _get_builder_actor, + _OrderedCacheWriter, + build_or_load_cache, +) +from levanter.utils.py_utils import logical_cpu_core_count +from levanter.utils.ray_utils import ExceptionInfo, SnitchRecipient, ser_exc_info + + +class TestProcessor(BatchProcessor[Sequence[int], dict[str, np.ndarray]]): + def __init__(self, batch_size: int = 8): + self._batch_size = batch_size + + def __call__(self, batch: Sequence[Sequence[int]]) -> Sequence[dict[str, np.ndarray]]: + # return pa.RecordBatch.from_arrays([pa.array(batch)], ["test"]) + return [{"test": np.asarray(x)} for x in batch] + + @property + def output_exemplar(self): + return {"test": np.array([0], dtype=np.int64)} + + @property + def batch_size(self) -> int: + return self._batch_size + + @property + def num_cpus(self) -> int: + return 1 + + +def simple_process(processor, source): + result = [] + for shard_name in source.shard_names: + for batch in source.open_shard(shard_name): + result.append(processor([batch])[0]) + + return result + + +def process_interleave(processor, source): + batch_size = processor.batch_size + shard_iterators = { + shard_name: batched(iter(source.open_shard(shard_name)), batch_size) for shard_name in source.shard_names + } + finished 
= 0 + + while finished < len(shard_iterators): + for shard_name, shard_iter in shard_iterators.items(): + if shard_iter is None: + continue + try: + batch = next(shard_iter) + yield from processor(batch) + except StopIteration: + shard_iterators[shard_name] = None + finished += 1 + + +def setup_module(module): + ray.init( + "local", num_cpus=max(2 * logical_cpu_core_count(), 8), ignore_reinit_error=True + ) # 2x cpu count is faster on my m1 + + +def teardown_module(module): + ray.shutdown() + + +class SimpleProcessor(BatchProcessor[Sequence[int], dict[str, np.ndarray]]): + def __init__(self, batch_size: int = 8): + self._batch_size = batch_size + + def __call__(self, batch: Sequence[Sequence[int]]) -> Sequence[dict[str, Sequence[int]]]: + return [{"data": x} for x in batch] + + @property + def batch_size(self) -> int: + return self._batch_size + + @property + def num_cpus(self) -> int: + return 1 + + @property + def output_exemplar(self) -> dict[str, np.ndarray]: + return {"data": np.array([0], dtype=np.int64)} + + +class SimpleShardSource(ShardedDataSource[list[int]]): + def __init__(self, num_shards: int = 4): + self._num_shards = num_shards + + @property + def shard_names(self) -> Sequence[str]: + return [f"shard_{i}" for i in range(self._num_shards)] + + def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[list[int]]: + # parse the shard name to get the shard number + shard_num = int(shard_name.split("_")[1]) + return ([shard_num * 10 + i] * 10 for i in range(row, 10)) + + +def test_serial_cache_writer(): + with tempfile.TemporaryDirectory() as tmpdir1: + source = SimpleShardSource(num_shards=4) + processor = SimpleProcessor() + + exemplar = {"data": np.array([0], dtype=np.int64)} + + with SerialCacheWriter(tmpdir1, exemplar) as writer: + for shard_name in source.shard_names: + for ex in batched(source.open_shard(shard_name), processor.batch_size): + writer.write_batch(processor(ex)) + + _ = writer.result() + data_path = writer._tree_store.path + + builder = TreeStore.open(exemplar, data_path, mode="r") + + assert len(builder) == 40 + + for i, x in enumerate(builder): + np.testing.assert_array_equal(x["data"], np.asarray([i % 10 + i // 10 * 10] * 10)) + + +def crappy_du(path): + import os + + total = 0 + for root, dirs, files in os.walk(path): + for f in files: + total += os.path.getsize(os.path.join(root, f)) + return total + + +@ray.remote +class PretendParent(SnitchRecipient): + def __init__(self): + self.logger = logging.getLogger("SnitchRecipient") + self.failure_received = asyncio.Event() + self.exception_info = None + self._finished_shards = set() + self._finished = False + self._ledger = None + self._desired_next_item = None + + def _child_failed(self, child: ray.actor.ActorHandle, exception: ExceptionInfo): + try: + self.logger.error(f"Child {child} failed with exception {exception}") + self.exception_info = exception + self.failure_received.set() + except Exception as e: + self.logger.error(f"Error in _child_failed: {e}") + + def shard_failed(self, shard_name, exc_info): + self.exception_info = exc_info + self.failure_received.set() + + async def wait_for_failure(self): + await self.failure_received.wait() + return self.exception_info + + def shard_finished(self, shard_name): + self._finished_shards.add(shard_name) + + def get_finished_shards(self): + return self._finished_shards + + def _updated_ledger(self, ledger): + if ledger.is_finished: + self._finished = True + + self._ledger = ledger + + def _finalize(self): + self._finished = True + + def 
is_finished(self): + return self._finished + + def signal_backpressure(self, desired_next_item: float): + self._desired_next_item = desired_next_item + + def desired_next_item(self): + return self._desired_next_item + + +@pytest.mark.asyncio +async def test_batch_finished(): + parent = PretendParent.remote() + exemplar = np.array([1, 2, 3]) + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1", "shard2", "shard3"] + writer = _OrderedCacheWriter.remote( + parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=1 + ) + + try: + shard_idx = "shard1" + shard_batch_idx = 0 + batch_result = [np.array([1, 2, 3])] + + await writer.batch_finished.remote(shard_idx, shard_batch_idx, batch_result) + shard_status = await writer.get_shard_status.remote("shard1") + assert shard_status.num_rows_committed == 1 + finally: + ray.kill(parent) + ray.kill(writer) + + +@pytest.mark.asyncio +async def test_shard_finished_reading(): + parent = PretendParent.remote() + exemplar = MagicMock() + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1", "shard2", "shard3"] + writer = _OrderedCacheWriter.remote(parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards) + + try: + shard_name = "shard1" + expected_batches = 5 + + await writer.shard_finished_reading.remote(shard_name, expected_batches) + shard_status = await writer.get_shard_status.remote(shard_name) + assert shard_status.is_finished is False + finally: + ray.kill(parent) + ray.kill(writer) + + +@pytest.mark.asyncio +async def test_get_shard_status(): + parent = PretendParent.remote() + exemplar = np.array([1, 2, 3]) + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1", "shard2", "shard3"] + writer = _OrderedCacheWriter.remote(parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards) + + try: + shard_name = "shard1" + shard_status = await writer.get_shard_status.remote(shard_name) + + assert shard_status.shard_name == shard_name + assert shard_status.num_rows_committed == 0 + assert not shard_status.is_finished + finally: + ray.kill(parent) + ray.kill(writer) + + +@pytest.mark.asyncio +async def test_shard_failed(): + parent = PretendParent.remote() + exemplar = MagicMock() + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1", "shard2", "shard3"] + writer = _OrderedCacheWriter.remote(parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards) + + try: + shard_name = "shard1" + batch_id = 0 + try: + raise Exception("Test Exception") + except: # noqa + exc_info = ser_exc_info() + + await writer.shard_failed.remote(shard_name, batch_id, exc_info) + exception_received = await parent.wait_for_failure.remote() + assert str(exception_received.ex) == str(exc_info.ex) + finally: + ray.kill(parent) + ray.kill(writer) + + +DEFAULT_BATCH_SIZE = 128 + + +@pytest.mark.asyncio +async def test_attempt_to_write_batches(): + parent = PretendParent.remote() + exemplar = np.array([1, 2, 3]) + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1", "shard2", "shard3"] + writer = _OrderedCacheWriter.remote( + parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=2 + ) + + try: + shard1_batch = [np.asarray([1, 2, 3])] + shard2_batch = [np.asarray([4, 5, 6, 7])] + + await writer.batch_finished.remote("shard1", 0, shard1_batch) + await writer.batch_finished.remote("shard2", 0, shard2_batch) + + ledger = await writer.get_ledger.remote() + assert ledger.is_finished is False + assert ledger.total_num_rows == 2 # Assuming each 
batch has 1 row for simplicity + + store = TreeStore.open(exemplar, cache_dir, mode="r") + assert len(store) == 2 + np.testing.assert_array_equal(store[0], shard1_batch[0]) + np.testing.assert_array_equal(store[1], shard2_batch[0]) + finally: + ray.kill(parent) + ray.kill(writer) + + +@pytest.mark.asyncio +async def test_finalize_cache(): + parent = PretendParent.remote() + exemplar = np.array([1, 2, 3]) + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1", "shard2", "shard3"] + writer = _OrderedCacheWriter.remote(parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards) + + try: + shard1_batch = [np.array([1, 2, 3])] + shard2_batch = [np.array([4, 5, 6, 7])] + + await writer.batch_finished.remote("shard1", 0, shard1_batch) + await writer.shard_finished_reading.remote("shard1", 1) + await writer.shard_finished_reading.remote("shard2", 1) + await writer.batch_finished.remote("shard2", 0, shard2_batch) + + ledger = await writer.get_ledger.remote() + assert ledger.is_finished is False + assert ledger.total_num_rows == 2 # Assuming each batch has 1 row for simplicity + + await writer.shard_finished_reading.remote("shard3", 0) + finished_shards = await parent.get_finished_shards.remote() + assert len(finished_shards) == 3 + + ledger = await writer.get_ledger.remote() + assert ledger.is_finished is True + assert ledger.total_num_rows == 2 + assert await parent.is_finished.remote() is True + finally: + ray.kill(parent) + ray.kill(writer) + + +@pytest.mark.asyncio +async def test_error_handling(): + parent = PretendParent.remote() + exemplar = np.array([1, 2, 3]) + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1", "shard2", "shard3"] + writer = _OrderedCacheWriter.remote(parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards) + + try: + with pytest.raises(TypeError): + await writer.batch_finished.remote("shard1", 0, None) + + exception_received = await parent.wait_for_failure.remote() + assert exception_received is not None + finally: + ray.kill(parent) + ray.kill(writer) + + +@pytest.mark.asyncio +async def test_out_of_order_batches_same_shard(): + parent = PretendParent.remote() + exemplar = np.array([1, 2, 3]) + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1"] + writer = _OrderedCacheWriter.remote( + parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=2 + ) + + try: + # Sending batch 1 before batch 0 for shard1 + shard1_batch0 = [np.array([1, 2, 3])] + shard1_batch1 = [np.array([4, 5, 6])] + + await writer.batch_finished.remote("shard1", 1, shard1_batch1) + await writer.batch_finished.remote("shard1", 0, shard1_batch0) + + store = TreeStore.open(exemplar, cache_dir, mode="r") + assert len(store) == 2 + np.testing.assert_array_equal(store[0], shard1_batch0[0]) + np.testing.assert_array_equal(store[1], shard1_batch1[0]) + finally: + ray.kill(parent) + ray.kill(writer) + + +@pytest.mark.asyncio +async def test_out_of_order_batches_different_shards(): + parent = PretendParent.remote() + exemplar = np.array([1, 2, 3]) + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1", "shard2"] + writer = _OrderedCacheWriter.remote( + parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=3 + ) + + try: + # Sending batches out of order across different shards + shard1_batch0 = [np.array([1, 2, 3])] + shard2_batch0 = [np.array([4, 5, 6])] + shard1_batch1 = [np.array([7, 8, 9])] + + await writer.batch_finished.remote("shard1", 1, shard1_batch1) + await 
writer.batch_finished.remote("shard2", 0, shard2_batch0) + await writer.batch_finished.remote("shard1", 0, shard1_batch0) + + store = TreeStore.open(exemplar, cache_dir, mode="r") + assert len(store) == 3 + np.testing.assert_array_equal(store[0], shard1_batch0[0]) + np.testing.assert_array_equal(store[1], shard2_batch0[0]) + np.testing.assert_array_equal(store[2], shard1_batch1[0]) + finally: + ray.kill(parent) + ray.kill(writer) + + +@pytest.mark.asyncio +async def test_batches_different_orders_all_shards(): + parent = PretendParent.remote() + exemplar = np.array([1, 2, 3]) + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1", "shard2", "shard3"] + writer = _OrderedCacheWriter.remote( + parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=2 + ) + + try: + # Sending batches in different orders across all shards + shard1_batch0 = [np.array([1, 2, 3])] + shard1_batch1 = [np.array([4, 5, 6])] + shard2_batch0 = [np.array([7, 8, 9])] + shard3_batch0 = [np.array([10, 11, 12])] + + await writer.batch_finished.remote("shard2", 0, shard2_batch0) + await writer.batch_finished.remote("shard3", 0, shard3_batch0) + await writer.batch_finished.remote("shard1", 1, shard1_batch1) + await writer.batch_finished.remote("shard1", 0, shard1_batch0) + + store = TreeStore.open(exemplar, cache_dir, mode="r") + assert len(store) == 4 + np.testing.assert_array_equal(store[0], shard1_batch0[0]) + np.testing.assert_array_equal(store[1], shard2_batch0[0]) + np.testing.assert_array_equal(store[2], shard3_batch0[0]) + np.testing.assert_array_equal(store[3], shard1_batch1[0]) + finally: + ray.kill(parent) + ray.kill(writer) + + +@pytest.mark.asyncio +async def test_intermixed_batches_same_and_different_shards(): + parent = PretendParent.remote() + exemplar = np.array([1, 2, 3]) + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1", "shard2", "shard3"] + writer = _OrderedCacheWriter.remote( + parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=1 + ) + + try: + # Sending intermixed batches from the same and different shards + shard1_batch0 = [np.array([1, 2, 3])] + shard2_batch0 = [np.array([4, 5, 6])] + shard1_batch1 = [np.array([7, 8, 9])] + shard3_batch0 = [np.array([10, 11, 12])] + shard2_batch1 = [np.array([13, 14, 15])] + + await writer.batch_finished.remote("shard2", 0, shard2_batch0) + await writer.batch_finished.remote("shard3", 0, shard3_batch0) + await writer.batch_finished.remote("shard1", 1, shard1_batch1) + await writer.batch_finished.remote("shard2", 1, shard2_batch1) + await writer.batch_finished.remote("shard1", 0, shard1_batch0) + + store = TreeStore.open(exemplar, cache_dir, mode="r") + assert len(store) == 5 + np.testing.assert_array_equal(store[0], shard1_batch0[0]) + np.testing.assert_array_equal(store[1], shard2_batch0[0]) + np.testing.assert_array_equal(store[2], shard3_batch0[0]) + np.testing.assert_array_equal(store[3], shard1_batch1[0]) + np.testing.assert_array_equal(store[4], shard2_batch1[0]) + finally: + ray.kill(parent) + ray.kill(writer) + + +@pytest.mark.asyncio +async def test_duplicate_batches_same_shard(): + parent = PretendParent.remote() + exemplar = np.array([1, 2, 3]) + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1"] + writer = _OrderedCacheWriter.remote(parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards) + + try: + # Sending duplicate batches for the same shard + shard1_batch0 = [np.array([1, 2, 3])] + + await writer.batch_finished.remote("shard1", 
0, shard1_batch0) + with pytest.raises(RayTaskError): + await writer.batch_finished.remote("shard1", 0, shard1_batch0) # Duplicate + finally: + ray.kill(parent) + ray.kill(writer) + + +@pytest.mark.asyncio +async def test_mixed_order_batches_multiple_shards(): + parent = PretendParent.remote() + exemplar = np.array([1, 2, 3]) + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1", "shard2", "shard3"] + writer = _OrderedCacheWriter.remote( + parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=1 + ) + + try: + # Sending batches in mixed order for multiple shards + shard1_batch0 = [np.array([1, 2, 3])] + shard2_batch0 = [np.array([4, 5, 6])] + shard1_batch1 = [np.array([7, 8, 9])] + shard2_batch1 = [np.array([10, 11, 12])] + shard3_batch0 = [np.array([13, 14, 15])] + shard3_batch1 = [np.array([16, 17, 18])] + + await writer.batch_finished.remote("shard3", 0, shard3_batch0) + await writer.batch_finished.remote("shard1", 1, shard1_batch1) + await writer.batch_finished.remote("shard2", 0, shard2_batch0) + await writer.batch_finished.remote("shard2", 1, shard2_batch1) + await writer.batch_finished.remote("shard1", 0, shard1_batch0) + await writer.batch_finished.remote("shard3", 1, shard3_batch1) + + store = TreeStore.open(exemplar, cache_dir, mode="r") + assert len(store) == 6 + np.testing.assert_array_equal(store[0], shard1_batch0[0]) + np.testing.assert_array_equal(store[1], shard2_batch0[0]) + np.testing.assert_array_equal(store[2], shard3_batch0[0]) + np.testing.assert_array_equal(store[3], shard1_batch1[0]) + np.testing.assert_array_equal(store[4], shard2_batch1[0]) + np.testing.assert_array_equal(store[5], shard3_batch1[0]) + finally: + ray.kill(parent) + ray.kill(writer) + + +@pytest.mark.ray +def test_full_end_to_end_cache_simple(): + td = tempfile.TemporaryDirectory() + with td as tmpdir: + ray_ds = build_or_load_cache( + tmpdir, + SimpleShardSource(num_shards=1), + TestProcessor(), + await_finished=True, + ) + + simple_processed = simple_process(TestProcessor(), SimpleShardSource()) + + all_data = ray_ds[:] + + check_datasets_equal(all_data, simple_processed) + + +@pytest.mark.ray +def test_cache_remembers_its_cached(): + directory = tempfile.TemporaryDirectory() + with directory as tmpdir: + ds1 = build_or_load_cache(tmpdir, SimpleShardSource(), TestProcessor()) + + class ThrowingProcessor(BatchProcessor[Sequence[int], dict[str, np.ndarray]]): + def __call__(self, batch: Sequence[Sequence[int]]): + raise RuntimeError("This should not be called") + + @property + def output_exemplar(self) -> dict[str, np.ndarray]: + return {"test": np.array([0], dtype=np.int64)} + + @property + def batch_size(self) -> int: + return 8 + + @property + def num_cpus(self) -> int: + return 1 + + # testing this doesn't throw + ds2 = build_or_load_cache(tmpdir, SimpleShardSource(), ThrowingProcessor(), await_finished=True) + + check_datasets_equal(ds1, ds2) + + +def check_datasets_equal(ds1, ds2): + for r1, r2 in zip(ds1, ds2): + assert r1.keys() == r2.keys() + for key in r1.keys(): + np.testing.assert_array_equal(r1[key], r2[key]) + + +class _CustomException(Exception): + pass + + +@pytest.mark.ray +@pytest.mark.skip("This test segfaults in CI. 
I think a ray bug") +def test_cache_recover_from_crash(): + class CrashingShardSource(ShardedDataSource[list[int]]): + def __init__(self, crash_point: int): + self.crash_point = crash_point + + @property + def shard_names(self) -> Sequence[str]: + return [f"shard_{i}" for i in range(4)] + + def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[list[int]]: + # parse the shard name to get the shard number + shard_num = int(shard_name.split("_")[1]) + for i in range(10): + if shard_num * 10 + i == self.crash_point: + raise _CustomException(f"Crashing at {shard_num} {i} {self.crash_point}") + if i >= row: + yield [shard_num * 10 + i] * 10 + + with tempfile.TemporaryDirectory() as tmpdir, tempfile.TemporaryDirectory() as tmpdir2: + source = CrashingShardSource(4) + with pytest.raises(_CustomException): + build_or_load_cache(tmpdir, source, TestProcessor()) + + # kill the broker actor so that we can test recovery + ray.kill( + _get_builder_actor(tmpdir, source, TestProcessor()), + no_restart=True, + ) + + source = CrashingShardSource(5) + with pytest.raises(_CustomException): + build_or_load_cache(tmpdir, source, TestProcessor()) + + ray.kill( + _get_builder_actor(tmpdir, source, TestProcessor()), + no_restart=True, + ) + + # testing this doesn't throw + source = CrashingShardSource(1000) + reader1 = build_or_load_cache(tmpdir, source, TestProcessor(), await_finished=True) + + # compare to the original with no crash + reader2 = build_or_load_cache(tmpdir2, SimpleShardSource(), TestProcessor(), await_finished=True) + + assert len(list(reader1)) == 40 + check_datasets_equal(reader1, reader2) + + +@pytest.mark.ray +def test_no_hang_if_empty_shard_source(): + class EmptyShardSource(ShardedDataSource[list[int]]): + @property + def shard_names(self) -> Sequence[str]: + return [] + + def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[list[int]]: + raise RuntimeError("This should not be called") + + with tempfile.TemporaryDirectory() as tmpdir: + reader = build_or_load_cache(tmpdir, EmptyShardSource(), TestProcessor()) + assert list(reader) == [] + + +@pytest.mark.ray +def test_chunk_ordering_is_correct_with_slow_shards(): + class SlowShardSource(ShardedDataSource[list[int]]): + @property + def shard_names(self) -> Sequence[str]: + return ["shard_0", "shard_1"] + + def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[list[int]]: + max_count = 40 if shard_name == "shard_1" else 20 + shard_id = int(shard_name.split("_")[1]) + for i in range(0, max_count): + yield [i * 10 + shard_id] * 10 + + with tempfile.TemporaryDirectory() as tmpdir: + cache = build_or_load_cache( + tmpdir, + SlowShardSource(), + TestProcessor(1), + await_finished=False, + ) + + # now block until the cache is done + cache.await_finished(timeout=10) + + expected = process_interleave(TestProcessor(1), SlowShardSource()) + + check_datasets_equal(list(cache[:]), expected) + + +@pytest.mark.asyncio +@pytest.mark.ray +async def test_can_get_elems_before_finished(): + @ray.remote(num_cpus=0) + class Blocker: + def __init__(self): + self.future = asyncio.Future() + + async def block(self): + await self.future + + def unblock(self): + self.future.set_result(None) + + blocker_to_wait_on_test = Blocker.remote() + + class SlowShardSource(ShardedDataSource[list[int]]): + @property + def shard_names(self) -> Sequence[str]: + return ["shard_0"] + + def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[list[int]]: + for i in range(10): + yield [i] * 10 + 
ray.get(blocker_to_wait_on_test.block.remote()) + for i in range(10, 20): + yield [i] * 10 + + with tempfile.TemporaryDirectory() as tmpdir: + cache = build_or_load_cache( + tmpdir, SlowShardSource(), TestProcessor(5), await_finished=False, items_per_write=5 + ) + + # read the first 10 elements + # ensure the first 10 elements are [{"test": np.array([i] * 10)} for i in range(10)] + first_10 = list(await cache.get_batch(range(0, 10))) + + for i, x in enumerate(first_10): + np.testing.assert_array_equal(x["test"], np.array([i] * 10)) + + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(cache.get_batch(range(10, 20)), timeout=0.1) + + # then unblock: + ray.get(blocker_to_wait_on_test.unblock.remote()) + + # now ensure we can get the next 10 elements, which will be + # [{"test": np.array([i] * 10)} for i in range(10, 20)] + batch = await asyncio.wait_for(cache.get_batch(range(10, 20)), timeout=10) + + for i, x in enumerate(batch): + np.testing.assert_array_equal(x["test"], np.array([i + 10] * 10)) + + ray.get(blocker_to_wait_on_test.block.remote()) + + # now wait until the cache is finished. mostly so that the tempdir cleanup works + cache.await_finished(timeout=10) + + +@pytest.mark.skip("This test segfaults in CI. I think a ray bug") +@pytest.mark.ray +def test_shard_cache_crashes_if_processor_throws(): + class ThrowingProcessor(BatchProcessor[Sequence[int], dict[str, np.ndarray]]): + def __call__(self, batch: Sequence[Sequence[int]]): + raise RuntimeError("exc") + + @property + def output_exemplar(self) -> dict: + return {"test": np.array([0], dtype=np.int64)} + + @property + def batch_size(self) -> int: + return 8 + + @property + def num_cpus(self) -> int: + return 1 + + with tempfile.TemporaryDirectory() as tmpdir: + with pytest.raises(RuntimeError): + build_or_load_cache(tmpdir, SimpleShardSource(), ThrowingProcessor(), await_finished=True) + + +@pytest.mark.ray +@pytest.mark.skip("This test segfaults in CI. I think a ray bug") +def test_shard_cache_fails_with_multiple_shards_with_the_same_name(): + with tempfile.TemporaryDirectory() as tmpdir: + with open(f"{tmpdir}/data.txt", "w") as f: + f.write("") + + with pytest.raises(ValueError): + TextUrlDataSource( + [f"{tmpdir}/data.txt", f"{tmpdir}/data.txt"], + ) + + with open(f"{tmpdir}/data.txt.1", "w") as f: + f.write("") + + dataset = TextUrlDataSource( + [f"{tmpdir}/data.txt", f"{tmpdir}/data.txt.1"], + ) + + build_or_load_cache(tmpdir, dataset, TestProcessor(), await_finished=True) + + +@pytest.mark.skip("This test segfaults in CI. I think a ray bug") +@pytest.mark.ray +@pytest.mark.asyncio +async def test_shard_cache_fails_gracefully_with_unknown_file_type_async(): + with tempfile.TemporaryDirectory() as tmpdir: + with open(f"{tmpdir}/data.not_a_real_extension", "w") as f: + f.write("") + + dataset = TextUrlDataSource( + [f"{tmpdir}/data.not_a_real_extension"], + ) + + with pytest.raises(ValueError): + build_or_load_cache(tmpdir, dataset, TestProcessor(), await_finished=True) + + # now make sure it works in non-blocking mode + + cache = build_or_load_cache(tmpdir, dataset, TestProcessor(), await_finished=False) + + with pytest.raises(ValueError): + await cache.get_batch([0]) + + with pytest.raises(ValueError): + cache.await_finished(timeout=10) + + del cache + + +@pytest.mark.skip("This test segfaults in CI. 
I think a ray bug") +@pytest.mark.ray +def test_shard_cache_fails_gracefully_with_unknown_file_type(): + with tempfile.TemporaryDirectory() as tmpdir: + with open(f"{tmpdir}/data.not_a_real_extension", "w") as f: + f.write("") + + dataset = TextUrlDataSource( + [f"{tmpdir}/data.not_a_real_extension"], + ) + + with pytest.raises(ValueError): + build_or_load_cache(tmpdir, dataset, TestProcessor(), await_finished=True) + + # now make sure it works in non-blocking mode + + cache = build_or_load_cache(tmpdir, dataset, TestProcessor(), await_finished=False) + + with pytest.raises(ValueError): + cache.get_batch_sync([0]) + + with pytest.raises(ValueError): + cache.await_finished(timeout=10) + + del cache + + +@pytest.mark.ray +@pytest.mark.asyncio +async def test_backpressure_mechanism(): + parent = PretendParent.remote() + exemplar = np.array([1, 2, 3]) + with tempfile.TemporaryDirectory() as cache_dir: + shards = ["shard1", "shard2", "shard3"] + writer = _OrderedCacheWriter.remote( + parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=1 + ) + + # Simulate batches being processed + shard1_batch = [np.array([1, 2, 3])] + shard2_batch = [np.array([4, 5, 6])] + shard3_batch = [np.array([7, 8, 9])] + + # await writer.batch_finished.remote("shard1", 0, shard1_batch) + await writer.batch_finished.remote("shard2", 0, shard2_batch) + await writer.batch_finished.remote("shard3", 0, shard3_batch) + await writer.batch_finished.remote("shard1", 1, shard3_batch) + await writer.batch_finished.remote("shard1", 2, shard3_batch) + await writer.batch_finished.remote("shard1", 3, shard3_batch) + + # Check if backpressure is signaled + is_overwhelmed = await writer.is_overwhelmed.remote() + assert is_overwhelmed is True + + for i in range(4): + if (await parent.desired_next_item.remote()) == 0: + break + + await asyncio.sleep(0.1 * (i + 1) * (i + 1)) + else: + assert False, "Backpressure wasn't sent" + + await writer.batch_finished.remote("shard1", 0, shard1_batch) + + # Reduce the queue size to relieve backpressure + # Check if backpressure is relieved + is_overwhelmed = await writer.is_overwhelmed.remote() + assert is_overwhelmed is False + + for i in range(4): + if (await parent.desired_next_item.remote()) is None: + break + + await asyncio.sleep(0.1 * (i + 1) * (i + 1)) + else: + assert False, "Backpressure wasn't relieved" diff --git a/tests/test_replicated_loader.py b/tests/test_new_loader.py similarity index 62% rename from tests/test_replicated_loader.py rename to tests/test_new_loader.py index 431a1c0bb..e6f9a3dd7 100644 --- a/tests/test_replicated_loader.py +++ b/tests/test_new_loader.py @@ -1,5 +1,5 @@ -import itertools -from typing import Sequence +import asyncio +from typing import Optional, Sequence import jax import numpy as np @@ -9,26 +9,16 @@ from haliax import Axis from haliax.partitioning import ResourceAxis -import levanter.data -from levanter.data.loader import ReplicatedBatchLoader, check_sharded_consistency -from test_utils import skip_if_not_enough_devices +from levanter.data.dataset import AsyncDataset, ListAsyncDataset +from levanter.data.loader import DataLoader, check_sharded_consistency +from .test_utils import skip_if_not_enough_devices -def _small_dataset(seq_len=128, num_sequences=200) -> levanter.data.ShardableDataset[Sequence[int]]: - class SequenceDataset(levanter.data.ShardableDataset[np.ndarray]): - def __init__(self, sequences: Sequence[np.ndarray]): - self.sequences = sequences - def shard(self, shard_idx: int, num_shards: int) -> 
levanter.data.ShardableDataset[np.ndarray]: - return SequenceDataset(self.sequences[shard_idx::num_shards]) - - def __iter__(self): - yield from self.sequences - - # sequences = [list(range(i * 1000, i * 1000 + seq_len)) for i in range(num_sequences)] +def _small_dataset(seq_len=128, num_sequences=200) -> AsyncDataset[Sequence[int]]: sequences = [np.arange(seq_len) + 1000 * i for i in range(num_sequences)] - return SequenceDataset(sequences) + return ListAsyncDataset(sequences, is_complete=True) @skip_if_not_enough_devices(2) @@ -45,9 +35,9 @@ def test_local_batched_data_loading_model_axis_2(): seq_len = 128 cache = _small_dataset(seq_len) Batch = Axis("batch", len(devices)) - loader = ReplicatedBatchLoader(cache, mesh, Batch) + loader = DataLoader(Batch, cache, max_buffered_batches=10, mesh=mesh, axis_resources=None) - batches = list(itertools.islice(loader, 10)) + batches = list(loader) for batch in batches: check_sharded_consistency(batch, check_disjoint_indices_are_different=True) @@ -65,36 +55,46 @@ def test_local_batched_data_loading_model_axis_1(): seq_len = 128 cache = _small_dataset(seq_len) Batch = Axis("batch", len(devices)) - loader = ReplicatedBatchLoader(cache, mesh, Batch) + loader = DataLoader(Batch, cache, max_buffered_batches=10, mesh=mesh, axis_resources=None) - batches = list(itertools.islice(loader, 10)) + batches = list(loader) for batch in batches: check_sharded_consistency(batch, check_disjoint_indices_are_different=True) -class StructuredDataset(levanter.data.ShardableDataset): - def __init__(self, seq_len, begin, end, stride): +class StructuredDataset(AsyncDataset): + def __init__(self, seq_len): self.seq_len = seq_len - self.begin = begin - self.end = end - self.stride = stride + self.begin = 0 + self.end = 256 + self.stride = 1 + + async def async_len(self) -> int: + return (self.end - self.begin) // self.stride - def __getitem__(self, item): + async def getitem_async(self, index: int) -> dict: + index = self.begin + index * self.stride return { - "input_ids": np.arange(self.seq_len, dtype=np.int32) + item * 1000, - "labels": np.arange(self.seq_len, dtype=np.int32) + item * 1000, + "input_ids": np.arange(self.seq_len, dtype=np.int32) + index * 1000, + "labels": np.arange(self.seq_len, dtype=np.int32) + index * 1000, "extra": { - "input_ids": np.arange(self.seq_len, dtype=np.int32) + item * 1000, - "mask": np.arange(self.seq_len * 2, dtype=np.int32).reshape(-1, 2) + item * 1000, + "input_ids": np.arange(self.seq_len, dtype=np.int32) + index * 1000, + "mask": np.arange(self.seq_len * 2, dtype=np.int32).reshape(-1, 2) + index * 1000, }, } - def __iter__(self): - for i in range(self.begin, self.end, self.stride): - yield self[i] + async def final_length_is_known(self) -> bool: + return True + + def is_finite(self) -> bool: + return True - def shard(self, shard_id: int, num_shards: int): - return StructuredDataset(self.seq_len, self.begin + shard_id, self.end, self.stride * num_shards) + async def current_len(self) -> Optional[int]: + return await self.async_len() + + async def get_batch(self, indices: Sequence[int]): + out = await asyncio.gather(*(self.getitem_async(i) for i in indices)) + return out def test_structured_batches_model_axis_1(): @@ -107,11 +107,11 @@ def test_structured_batches_model_axis_1(): ) with mesh, haliax.axis_mapping({"batch": ResourceAxis.DATA}): seq_len = 128 - dataset = StructuredDataset(seq_len, 0, 256, 1) + dataset = StructuredDataset(seq_len) Batch = Axis("batch", len(devices)) - loader = ReplicatedBatchLoader(dataset, mesh, Batch) + 
loader = DataLoader(Batch, dataset, max_buffered_batches=10, mesh=mesh, axis_resources=None) - batches = list(itertools.islice(loader, 10)) + batches = list(loader) for batch in batches: check_sharded_consistency(batch, check_disjoint_indices_are_different=True) @@ -127,16 +127,16 @@ def test_structured_batches_model_axis_2(): ) with mesh, haliax.axis_mapping({"batch": ResourceAxis.DATA}): seq_len = 128 - dataset = StructuredDataset(seq_len, 0, 256, 1) + dataset = StructuredDataset(seq_len) Batch = Axis("batch", len(devices)) - loader = ReplicatedBatchLoader(dataset, mesh, Batch) + loader = DataLoader(Batch, dataset, max_buffered_batches=10, mesh=mesh, axis_resources=None) - batches = list(itertools.islice(loader, 10)) + batches = list(loader) for batch in batches: check_sharded_consistency(batch, check_disjoint_indices_are_different=True) -class StructuredDatasetWithNames(levanter.data.ShardableDataset): +class StructuredDatasetWithNames(AsyncDataset): def __init__(self, Height: Axis, Width: Axis, begin, end, stride): self.Height = Height self.Width = Width @@ -144,6 +144,33 @@ def __init__(self, Height: Axis, Width: Axis, begin, end, stride): self.end = end self.stride = stride + async def final_length_is_known(self) -> bool: + return True + + def is_finite(self) -> bool: + return True + + async def current_len(self) -> Optional[int]: + return True + + async def get_batch(self, indices: Sequence[int]): + out = await asyncio.gather(*(self.getitem_async(i) for i in indices)) + return out + + async def async_len(self) -> int: + return (self.end - self.begin) // self.stride + + async def getitem_async(self, index: int) -> dict: + index = self.begin + index * self.stride + return { + "input_ids": self._gen_image(index), + "labels": self._gen_image(index), + "extra": { + "input_ids": self._gen_image(index), + "mask": haliax.arange(self.Height) + index * 1000, + }, + } + def _gen_image(self, index): image = ( np.arange(self.Height.size * self.Width.size, dtype=np.int32).reshape(self.Height.size, self.Width.size) @@ -152,25 +179,10 @@ def _gen_image(self, index): return haliax.named(image, (self.Height, self.Width)) - def __getitem__(self, item): - return { - "input_ids": self._gen_image(item), - "labels": self._gen_image(item), - "extra": { - "input_ids": self._gen_image(item), - "mask": haliax.arange(self.Height) + item * 1000, - }, - } - def __iter__(self): for i in range(self.begin, self.end, self.stride): yield self[i] - def shard(self, shard_id: int, num_shards: int): - return StructuredDatasetWithNames( - self.Height, self.Width, self.begin + shard_id, self.end, self.stride * num_shards - ) - def test_structured_batches_model_axis_1_with_names(): devices = jax.devices() @@ -183,11 +195,11 @@ def test_structured_batches_model_axis_1_with_names(): with mesh, haliax.axis_mapping({"batch": ResourceAxis.DATA}): Height = Axis("Height", 16) Width = Axis("Width", 16) - dataset = StructuredDatasetWithNames(Height, Width, 0, 256, 1) + dataset = StructuredDatasetWithNames(Height, Width, 0, len(devices) * 10, 1) Batch = Axis("batch", len(devices)) - loader = ReplicatedBatchLoader(dataset, mesh, Batch) + loader = DataLoader(Batch, dataset, max_buffered_batches=10, mesh=mesh, axis_resources=None) - batches = list(itertools.islice(loader, 10)) + batches = list(loader) for batch in batches: check_sharded_consistency(batch, check_disjoint_indices_are_different=True) @@ -208,9 +220,9 @@ def test_structured_batches_model_axis_2_with_names(): Width = Axis("Width", 16) dataset = 
StructuredDatasetWithNames(Height, Width, 0, 256, 1) Batch = Axis("batch", len(devices)) - loader = ReplicatedBatchLoader(dataset, mesh, Batch) + loader = DataLoader(Batch, dataset, max_buffered_batches=10, mesh=mesh, axis_resources=None) - batches = list(itertools.islice(loader, 10)) + batches = list(loader) for batch in batches: check_sharded_consistency(batch, check_disjoint_indices_are_different=True) @@ -230,8 +242,7 @@ def test_structured_batches_model_axis_2_subsharded(): with mesh, haliax.axis_mapping({"batch": ResourceAxis.DATA, Height.name: ResourceAxis.MODEL}): dataset = StructuredDatasetWithNames(Height, Width, 0, 256, 1) Batch = Axis("batch", len(devices)) - loader = ReplicatedBatchLoader(dataset, mesh, Batch) + loader = DataLoader(Batch, dataset, max_buffered_batches=10, mesh=mesh, axis_resources=None) - batches = list(itertools.islice(loader, 10)) - for batch in batches: + for batch in iter(loader): check_sharded_consistency(batch, check_disjoint_indices_are_different=True) diff --git a/tests/test_newdataset.py b/tests/test_newdataset.py new file mode 100644 index 000000000..030095b41 --- /dev/null +++ b/tests/test_newdataset.py @@ -0,0 +1,142 @@ +import asyncio + +import jax.random +import pytest + +from levanter.data import EraShufflingDataset, PermutationDataset +from levanter.data.dataset import ListAsyncDataset + + +@pytest.mark.asyncio +async def test_length_of_sequence_dataset_is_accurate(): + data = [1, 2, 3] + dataset = ListAsyncDataset(data) + assert (await dataset.current_len()) == 3 + assert not (await dataset.final_length_is_known()) + dataset.finalize() + assert (await dataset.current_len()) == 3 + assert await dataset.final_length_is_known() + assert (await dataset.async_len()) == 3 + + +@pytest.mark.asyncio +async def test_list_dataset_get_item_returns_correct_item(): + data = ["a", "b", "c"] + dataset = ListAsyncDataset(data) + assert await dataset.getitem_async(1) == "b" + + +@pytest.mark.asyncio +async def test_list_async_dataset_appends_and_finalizes_correctly(): + dataset = ListAsyncDataset([]) + dataset.append("a") + dataset.finalize() + assert await dataset.async_len() == 1 + assert await dataset.get_batch([0]) == ["a"] + + +@pytest.mark.asyncio +async def test_permutation_dataset_is_at_least_sometimes_permuted(): + for seed in range(10): + data = [1, 2, 3, 4] + dataset = ListAsyncDataset(data, is_complete=True) + permuted_dataset = PermutationDataset(dataset, jax.random.PRNGKey(seed)) + if await permuted_dataset.get_batch([0, 1, 2, 3]) != [1, 2, 3, 4]: + return + + pytest.fail("PermutationDataset did not permute the data") + + +@pytest.mark.asyncio +async def test_era_shuffling_dataset_returns_correct_length(): + data = list(range(100)) + dataset = ListAsyncDataset(data, is_complete=False) + era_length = 10 + key = jax.random.PRNGKey(0) + shuffling_dataset = EraShufflingDataset(dataset, era_length, key=key) + assert await shuffling_dataset.current_len() == 100 + assert not await shuffling_dataset.final_length_is_known() + + dataset.append(1) + assert await shuffling_dataset.current_len() == 100 + + +@pytest.mark.asyncio +async def test_era_shuffling_dataset_get_batch_returns_shuffled_batch(): + data = list(range(20)) + dataset = ListAsyncDataset(data) + dataset.finalize() + era_length = 5 + key = jax.random.PRNGKey(0) + shuffling_dataset = EraShufflingDataset(dataset, era_length, key=key) + batch_indices = [0, 1, 2, 3, 4] + batch = await shuffling_dataset.get_batch(batch_indices) + assert set(batch) == set([0, 1, 2, 3, 4]) # Ensures all elements are 
from the first era but does not assume order + assert batch != [0, 1, 2, 3, 4] # Ensures the batch is shuffled + + +@pytest.mark.asyncio +async def test_era_shuffling_can_grow(): + data = list(range(5)) + dataset = ListAsyncDataset(data) + era_length = 5 + key = jax.random.PRNGKey(0) + shuffling_dataset = EraShufflingDataset(dataset, era_length, key=key) + batch_indices = [0, 1, 2, 3, 4] + batch = await shuffling_dataset.get_batch(batch_indices) + assert set(batch) == set([0, 1, 2, 3, 4]) + + for i in range(5): + dataset.append(i + 5) + + assert await shuffling_dataset.current_len() == 10 + assert not await shuffling_dataset.final_length_is_known() + batch = await shuffling_dataset.get_batch(list(range(10))) + + assert set(batch) == set(range(10)) + assert set(batch[0:5]) == set([0, 1, 2, 3, 4]) + assert set(batch[5:10]) == set([5, 6, 7, 8, 9]) + + # now make sure that we can await data and it does get fulfilled + # this should timeout if we try to await it + coro = dataset.get_batch([11]) + try: + await asyncio.wait_for(coro, timeout=0.1) + pytest.fail("Should have timed out") + except asyncio.TimeoutError: + pass + + async def append_data(): + await asyncio.sleep(0.1) + for i in range(10, 15): + dataset.append(i) + + coro = dataset.getitem_async(11) + + _, r = await asyncio.gather(append_data(), coro) + assert r in range(10, 15) + + coro2 = shuffling_dataset.wait_until_len_at_least(20) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(coro2, timeout=0.1) + + assert await shuffling_dataset.current_len() == 15 + + coro2 = shuffling_dataset.wait_until_len_at_least(20) + dataset.append(15) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(coro2, timeout=0.1) + + assert await shuffling_dataset.current_len() == 15 + + coro2 = shuffling_dataset.wait_until_len_at_least(20) + dataset.finalize() + await asyncio.wait_for(coro2, timeout=0.1) + + assert await dataset.async_len() == 16 + assert await shuffling_dataset.current_len() == 16 + + coro = shuffling_dataset.get_batch(list(range(16))) + + batch = await coro + assert set(batch) == set(range(16)) diff --git a/tests/test_prp.py b/tests/test_prp.py new file mode 100644 index 000000000..6c549eabf --- /dev/null +++ b/tests/test_prp.py @@ -0,0 +1,87 @@ +import jax.numpy as jnp +import jax.random as jrandom +import pytest + +from levanter.data._prp import Permutation + + +def test_permutation_creates_valid_instance(): + length = 100 + prng_key = jrandom.PRNGKey(0) + permutation = Permutation(length, prng_key) + assert permutation.length == length + assert permutation._a > 0 and permutation._a < length + assert permutation._b >= 0 and permutation._b < length + + +def test_permutation_with_single_index_returns_correct_value(): + length = 10 + prng_key = jrandom.PRNGKey(0) + permutation = Permutation(length, prng_key) + index = 5 + result = permutation(index) + assert isinstance(result, int) + assert result != index # In most cases, result should not equal the input for a permutation + + +def test_permutation_with_array_returns_correct_values(): + length = 10 + prng_key = jrandom.PRNGKey(0) + permutation = Permutation(length, prng_key) + indices = jnp.arange(length) + results = permutation(indices) + assert isinstance(results, jnp.ndarray) + assert len(results) == length + assert jnp.sum(results == indices) <= 2 + + +def test_permutation_is_bijective_over_full_range(): + length = 10 + prng_key = jrandom.PRNGKey(0) + permutation = Permutation(length, prng_key) + indices = jnp.arange(length) + permuted = 
permutation(indices) + # Check if all elements are unique, which is a necessary condition for a bijective function + assert len(jnp.unique(permuted)) == length + + +def test_permutation_handles_edge_case_length_one(): + length = 1 + prng_key = jrandom.PRNGKey(0) + permutation = Permutation(length, prng_key) + result = permutation(0) + assert result == 0 # With length 1, the only valid output is the input it + + +def test_permutation_rejects_invalid_indices(): + length = 10 + prng_key = jrandom.PRNGKey(0) + permutation = Permutation(length, prng_key) + with pytest.raises(IndexError): + permutation(-1) # Test negative index + with pytest.raises(IndexError): + permutation(length) # Test index equal to length + + +def test_permutation_is_deterministic(): + length = 4 + prng_key = jrandom.PRNGKey(0) + permutation = Permutation(length, prng_key) + indices = jnp.arange(length) + results = permutation(indices) + prng_key = jrandom.PRNGKey(0) + permutation = Permutation(length, prng_key) + results2 = permutation(indices) + assert jnp.all(results == results2) + + +def test_permutation_is_deterministic1(): + length = 4 + prng_key = jrandom.PRNGKey(1) + permutation = Permutation(length, prng_key) + indices = jnp.arange(length) + results = permutation(indices) + prng_key = jrandom.PRNGKey(1) + permutation = Permutation(length, prng_key) + results2 = permutation(indices) + assert jnp.all(results == results2) diff --git a/tests/test_shard_cache.py b/tests/test_shard_cache.py deleted file mode 100644 index 7500307db..000000000 --- a/tests/test_shard_cache.py +++ /dev/null @@ -1,383 +0,0 @@ -import asyncio -import tempfile -from typing import Iterator, List, Sequence - -import pyarrow as pa -import pytest -import ray - -from levanter.data._preprocessor import BatchProcessor -from levanter.data.shard_cache import ChunkMetadata, SerialCacheWriter, _get_broker_actor, build_or_load_cache -from levanter.data.sharded_dataset import ShardedDataset, TextUrlDataset -from levanter.utils.py_utils import logical_cpu_core_count -from test_utils import skip_in_ci - - -def setup_module(module): - ray.init("local", num_cpus=max(2 * logical_cpu_core_count(), 8)) # 2x cpu count is faster on my m1 - - -def teardown_module(module): - ray.shutdown() - - -# tests to write: -# - test idempotency of writes - - -class TestProcessor(BatchProcessor[Sequence[int]]): - def __init__(self, batch_size: int = 8): - self._batch_size = batch_size - - def __call__(self, batch: Sequence[Sequence[int]]) -> pa.RecordBatch: - return pa.RecordBatch.from_arrays([pa.array(batch)], ["test"]) - - @property - def batch_size(self) -> int: - return self._batch_size - - @property - def num_cpus(self) -> int: - return 1 - - -class SimpleShardSource(ShardedDataset[List[int]]): - def __init__(self, num_shards: int = 4): - self._num_shards = num_shards - - @property - def shard_names(self) -> Sequence[str]: - return [f"shard_{i}" for i in range(self._num_shards)] - - def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[List[int]]: - # parse the shard name to get the shard number - shard_num = int(shard_name.split("_")[1]) - return ([shard_num * 10 + i] * 10 for i in range(row, 10)) - - -def simple_process(processor, source): - result = [] - for shard_name in source.shard_names: - for batch in source.open_shard(shard_name): - result.append(processor([batch])) - - return result - - -@pytest.mark.ray -@pytest.mark.parametrize("shards_to_read_at_once", [1, 2, 4]) -def test_cache_simple(shards_to_read_at_once): - td = tempfile.TemporaryDirectory() - 
with td as tmpdir: - ray_ds = build_or_load_cache( - tmpdir, - SimpleShardSource(), - TestProcessor(), - await_finished=True, - # shards_to_read_at_once=shards_to_read_at_once, - ) - - simple_processed = simple_process(TestProcessor(), SimpleShardSource()) - - assert list(ray_ds) == list(simple_processed) - - -@pytest.mark.ray -def test_cache_remembers_its_cached(): - directory = tempfile.TemporaryDirectory() - with directory as tmpdir: - ds1 = build_or_load_cache(tmpdir, SimpleShardSource(), TestProcessor()) - - class ThrowingProcessor(BatchProcessor[Sequence[int]]): - def __call__(self, batch: Sequence[Sequence[int]]) -> pa.RecordBatch: - raise RuntimeError("This should not be called") - - @property - def batch_size(self) -> int: - return 8 - - @property - def num_cpus(self) -> int: - return 1 - - # testing this doesn't throw - ds2 = build_or_load_cache(tmpdir, SimpleShardSource(), ThrowingProcessor(), await_finished=True) - - assert list(ds1) == list(ds2) - # ensure we delete tmpdir, since something is holding onto it - - -class _CustomException(Exception): - pass - - -@pytest.mark.ray -@skip_in_ci -def test_cache_recover_from_crash(): - class CrashingShardSource(ShardedDataset[List[int]]): - def __init__(self, crash_point: int): - self.crash_point = crash_point - - @property - def shard_names(self) -> Sequence[str]: - return [f"shard_{i}" for i in range(4)] - - def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[List[int]]: - # parse the shard name to get the shard number - shard_num = int(shard_name.split("_")[1]) - for i in range(10): - if shard_num * 10 + i == self.crash_point: - raise _CustomException(f"Crashing at {shard_num} {i} {self.crash_point}") - if i >= row: - yield [shard_num * 10 + i] * 10 - - with tempfile.TemporaryDirectory() as tmpdir, tempfile.TemporaryDirectory() as tmpdir2: - source = CrashingShardSource(4) - with pytest.raises(_CustomException): - build_or_load_cache(tmpdir, source, TestProcessor()) - - # kill the broker actor so that we can test recovery - ray.kill(_get_broker_actor(tmpdir, source, TestProcessor()), no_restart=True) - - source = CrashingShardSource(5) - with pytest.raises(_CustomException): - build_or_load_cache(tmpdir, source, TestProcessor()) - - ray.kill(_get_broker_actor(tmpdir, source, TestProcessor()), no_restart=True) - - # testing this doesn't throw - source = CrashingShardSource(1000) - reader1 = build_or_load_cache(tmpdir, source, TestProcessor(), batch_size=1, await_finished=True) - - # compare to the original with no crash - reader2 = build_or_load_cache(tmpdir2, SimpleShardSource(), TestProcessor(), batch_size=1, await_finished=True) - - assert list(reader1) == list(reader2) - assert len(list(reader1)) == 40 - - -@pytest.mark.ray -def test_no_hang_if_empty_shard_source(): - class EmptyShardSource(ShardedDataset[List[int]]): - @property - def shard_names(self) -> Sequence[str]: - return [] - - def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[List[int]]: - raise RuntimeError("This should not be called") - - with tempfile.TemporaryDirectory() as tmpdir: - reader = build_or_load_cache(tmpdir, EmptyShardSource(), TestProcessor(), batch_size=1) - assert list(reader) == [] - - -@skip_in_ci -@pytest.mark.ray -def test_chunk_ordering_is_correct_with_slow_shards(): - class SlowShardSource(ShardedDataset[List[int]]): - @property - def shard_names(self) -> Sequence[str]: - return ["shard_0", "shard_1"] - - def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[List[int]]: - max_count = 40 if shard_name 
== "shard_1" else 20 - for i in range(0, max_count): - yield [i] * 10 - - with tempfile.TemporaryDirectory() as tmpdir: - cache = build_or_load_cache( - tmpdir, - SlowShardSource(), - TestProcessor(1), - batch_size=1, - rows_per_chunk=10, - await_finished=False, - ) - - # now block until the cache is done - cache.await_finished(timeout=10) - - # now check that the chunks are in the right order - # TODO: this is a bit gross - chunks: List[ChunkMetadata] = ray.get([cache._broker.get_chunk.remote(i) for i in range(6)]) - assert chunks[0].name == "shard_0/chunk-0" - assert chunks[1].name == "shard_1/chunk-0" - assert chunks[2].name == "shard_0/chunk-1" - assert chunks[3].name == "shard_1/chunk-1" - assert chunks[4].name == "shard_1/chunk-2" - assert chunks[5].name == "shard_1/chunk-3" - - # make sure there's not a 7th chunk - chunk = ray.get(cache._broker.get_chunk.remote(6), timeout=0.5) - assert chunk is None - - -@skip_in_ci -@pytest.mark.ray -def test_can_get_chunk_before_finished(): - @ray.remote(num_cpus=0) - class Blocker: - def __init__(self): - self.future = asyncio.Future() - - async def block(self): - await self.future - - def unblock(self): - self.future.set_result(None) - - blocker_to_wait_on_test = Blocker.remote() - - class SlowShardSource(ShardedDataset[List[int]]): - @property - def shard_names(self) -> Sequence[str]: - return ["shard_0"] - - def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[List[int]]: - for i in range(10): - yield [i] * 10 - ray.get(blocker_to_wait_on_test.block.remote()) - for i in range(10, 20): - yield [i] * 10 - - with tempfile.TemporaryDirectory() as tmpdir: - cache = build_or_load_cache( - tmpdir, SlowShardSource(), TestProcessor(5), batch_size=1, rows_per_chunk=10, await_finished=False - ) - - def back_to_py(batch: pa.RecordBatch): - return list(batch["test"].values.to_numpy()) - - chunk = [back_to_py(batch) for batch in cache.read_chunk(0)] - - assert [list(x) for x in chunk] == [[i] * 10 for i in range(10)] - - with pytest.raises(TimeoutError): - cache.get_chunk(1, timeout=0.1) - - ray.get(blocker_to_wait_on_test.unblock.remote()) - - chunk = [back_to_py(batch) for batch in cache.read_chunk(1)] - - assert [list(x) for x in chunk] == [[i] * 10 for i in range(10, 20)] - - ray.get(blocker_to_wait_on_test.block.remote()) - - # now wait until the cache is finished. 
mostly so that the tempdir cleanup works - cache.await_finished(timeout=10) - - -@skip_in_ci -@pytest.mark.ray -def test_shard_cache_crashes_if_processor_throws(): - class ThrowingProcessor(BatchProcessor[Sequence[int]]): - def __call__(self, batch: Sequence[Sequence[int]]) -> pa.RecordBatch: - raise RuntimeError("exc") - - @property - def batch_size(self) -> int: - return 8 - - @property - def num_cpus(self) -> int: - return 1 - - with tempfile.TemporaryDirectory() as tmpdir: - with pytest.raises(RuntimeError): - build_or_load_cache(tmpdir, SimpleShardSource(), ThrowingProcessor(), await_finished=True) - - -@skip_in_ci -@pytest.mark.ray -def test_map_batches_and_map_shard_cache(): - td = tempfile.TemporaryDirectory() - with td as tmpdir: - ray_ds = ( - SimpleShardSource() - .map(lambda list: list * 2) - .map_batches(TestProcessor(), 8) - .map(lambda d: {"q": d["test"]}) - .build_or_load_cache(tmpdir, await_finished=True) - ) - - def composite_fn(list): - assert len(list) == 1 - return {"q": list[0] * 2} - - simple_processed = simple_process(composite_fn, SimpleShardSource()) - - # we internally change all the int lists in the ray_ds to np arrays, so we need to convert them back to lists - ray_entries = [] - for entry in ray_ds: - assert entry.keys() == {"q"} - ray_entries.append({"q": entry["q"].tolist()}) - - assert ray_entries == list(simple_processed) - - -@pytest.mark.ray -def test_serial_cache_writer(): - with tempfile.TemporaryDirectory() as tmpdir1, tempfile.TemporaryDirectory() as tmpdir2: - source = SimpleShardSource(num_shards=4) - processor = TestProcessor() - - with SerialCacheWriter(tmpdir1, rows_per_chunk=8) as writer: - for shard_name in source.shard_names: - for batch in source.open_shard(shard_name): - writer.write_batch(processor([batch])) - - serial = writer.result(batch_size=1) - ray_ds = build_or_load_cache(tmpdir2, source, processor, await_finished=True) - - def freeze_batch(batch): - # make it hashable - return tuple(batch["test"].values.to_numpy()) - - assert set(freeze_batch(batch) for batch in serial) == set(freeze_batch(batch) for batch in ray_ds) - - -@skip_in_ci -@pytest.mark.ray -def test_shard_cache_fails_with_multiple_shards_with_the_same_name(): - with tempfile.TemporaryDirectory() as tmpdir: - with open(f"{tmpdir}/data.txt", "w") as f: - f.write("") - - with pytest.raises(ValueError): - TextUrlDataset( - [f"{tmpdir}/data.txt", f"{tmpdir}/data.txt"], - ) - - with open(f"{tmpdir}/data.txt.1", "w") as f: - f.write("") - - dataset = TextUrlDataset( - [f"{tmpdir}/data.txt", f"{tmpdir}/data.txt.1"], - ) - - build_or_load_cache(tmpdir, dataset, TestProcessor(), await_finished=True) - - -@skip_in_ci -@pytest.mark.ray -def test_shard_cache_fails_gracefully_with_unknown_file_type(): - with tempfile.TemporaryDirectory() as tmpdir: - with open(f"{tmpdir}/data.not_a_real_extension", "w") as f: - f.write("") - - dataset = TextUrlDataset( - [f"{tmpdir}/data.not_a_real_extension"], - ) - - with pytest.raises(ValueError): - build_or_load_cache(tmpdir, dataset, TestProcessor(), await_finished=True) - - # now make sure it works in non-blocking mode - - cache = build_or_load_cache(tmpdir, dataset, TestProcessor(), await_finished=False) - - with pytest.raises(ValueError): - cache.get_chunk(0, timeout=5) - - with pytest.raises(ValueError): - cache.await_finished(timeout=10) diff --git a/tests/test_sharded_dataset.py b/tests/test_sharded_dataset.py index 5183c55a4..b3c8bcc8d 100644 --- a/tests/test_sharded_dataset.py +++ b/tests/test_sharded_dataset.py @@ -1,6 +1,6 @@ import 
tempfile -from levanter.data.sharded_dataset import AudioTextUrlDataset, _sniff_format_for_dataset +from levanter.data.sharded_datasource import AudioTextUrlDataSource, _sniff_format_for_dataset from test_utils import skip_if_no_soundlibs @@ -26,4 +26,4 @@ def test_sniff_format_for_json(): @skip_if_no_soundlibs def test_resolve_audio_pointer(): - AudioTextUrlDataset.resolve_audio_pointer("https://ccrma.stanford.edu/~jos/mp3/trumpet.mp3", 16_000) + AudioTextUrlDataSource.resolve_audio_pointer("https://ccrma.stanford.edu/~jos/mp3/trumpet.mp3", 16_000) diff --git a/tests/test_sharded_loader.py b/tests/test_sharded_loader.py deleted file mode 100644 index ec46fb6a6..000000000 --- a/tests/test_sharded_loader.py +++ /dev/null @@ -1,299 +0,0 @@ -import itertools -from typing import Sequence - -import jax -import jax.numpy as jnp -import numpy as np -from jax.sharding import Mesh - -import haliax as hax -from haliax import Axis -from haliax.partitioning import ResourceAxis - -import levanter.data -from levanter.data.loader import ShardedBatchLoader, check_sharded_consistency -from test_utils import skip_if_not_enough_devices - - -NUM_SHARDS_TINY = 16 - - -def _small_dataset(seq_len=128, num_sequences=200) -> levanter.data.ShardableDataset[Sequence[int]]: - class SequenceDataset(levanter.data.ShardableDataset[np.ndarray]): - def __init__(self, sequences: Sequence[np.ndarray]): - self.sequences = sequences - - def shard(self, shard_idx: int, num_shards: int) -> levanter.data.ShardableDataset[np.ndarray]: - return SequenceDataset(self.sequences[shard_idx::num_shards]) - - def __iter__(self): - yield from self.sequences - - # sequences = [list(range(i * 1000, i * 1000 + seq_len)) for i in range(num_sequences)] - sequences = [np.arange(seq_len) + 1000 * i for i in range(num_sequences)] - - return SequenceDataset(sequences) - - -@skip_if_not_enough_devices(2) -def test_sharded_data_loading_model_axis_2(): - devices = jax.devices() - model_axis_size = 2 - - mesh = Mesh( - np.array(devices).reshape(1, -1, model_axis_size), - (ResourceAxis.REPLICA, ResourceAxis.DATA, ResourceAxis.MODEL), - ) - with mesh, hax.axis_mapping({"batch": (ResourceAxis.REPLICA, ResourceAxis.DATA)}): - seq_len = 128 - cache = _small_dataset(seq_len) - Batch = Axis("batch", len(devices)) - loader = ShardedBatchLoader(cache, mesh, Batch) - - batches = list(itertools.islice(loader, 10)) - for batch in batches: - check_sharded_consistency(batch, check_disjoint_indices_are_different=True) - - -def test_sharded_data_loading_model_axis_1(): - devices = jax.devices() - model_axis_size = 1 - - mesh = Mesh( - np.array(devices).reshape(1, -1, model_axis_size), - (ResourceAxis.REPLICA, ResourceAxis.DATA, ResourceAxis.MODEL), - ) - with mesh, hax.axis_mapping({"batch": (ResourceAxis.REPLICA, ResourceAxis.DATA)}): - seq_len = 128 - cache = _small_dataset(seq_len) - Batch = Axis("batch", len(devices)) - loader = ShardedBatchLoader(cache, mesh, Batch) - - batches = list(itertools.islice(loader, 10)) - for batch in batches: - check_sharded_consistency(batch, check_disjoint_indices_are_different=True) - - -class StructuredDataset(levanter.data.ShardableDataset): - def __init__(self, seq_len, begin, end, stride): - self.seq_len = seq_len - self.begin = begin - self.end = end - self.stride = stride - - def __getitem__(self, item): - return { - "input_ids": np.arange(self.seq_len, dtype=np.int32) + item * 1000, - "labels": np.arange(self.seq_len, dtype=np.int32) + item * 1000, - "extra": { - "input_ids": np.arange(self.seq_len, dtype=np.int32) + item * 
1000, - "mask": np.arange(self.seq_len * 2, dtype=np.int32).reshape(-1, 2) + item * 1000, - }, - } - - def __iter__(self): - for i in range(self.begin, self.end, self.stride): - yield self[i] - - def shard(self, shard_id: int, num_shards: int): - return StructuredDataset(self.seq_len, self.begin + shard_id, self.end, self.stride * num_shards) - - -def test_structured_batches_model_axis_1(): - devices = jax.devices() - model_axis_size = 1 - - mesh = Mesh( - np.array(devices).reshape(1, -1, model_axis_size), - (ResourceAxis.REPLICA, ResourceAxis.DATA, ResourceAxis.MODEL), - ) - with mesh, hax.axis_mapping({"batch": (ResourceAxis.REPLICA, ResourceAxis.DATA)}): - seq_len = 128 - dataset = StructuredDataset(seq_len, 0, 256, 1) - Batch = Axis("batch", len(devices)) - loader = ShardedBatchLoader(dataset, mesh, Batch) - - batches = list(itertools.islice(loader, 10)) - for batch in batches: - check_sharded_consistency(batch, check_disjoint_indices_are_different=True) - - -class ScalarDataset(levanter.data.ShardableDataset[hax.NamedArray]): - def __init__(self, begin, end, stride): - self.begin = begin - self.end = end - self.stride = stride - - def __getitem__(self, item): - return hax.named(jnp.array(item), ()) - - def __iter__(self): - for i in range(self.begin, self.end, self.stride): - yield self[i] - - def shard(self, shard_id: int, num_shards: int): - return ScalarDataset(self.begin + shard_id, self.end, self.stride * num_shards) - - -def test_can_batch_named_scalars(): - - devices = jax.devices() - model_axis_size = 1 - - mesh = Mesh( - np.array(devices).reshape(1, -1, model_axis_size), - (ResourceAxis.REPLICA, ResourceAxis.DATA, ResourceAxis.MODEL), - ) - with mesh, hax.axis_mapping({"batch": (ResourceAxis.REPLICA, ResourceAxis.DATA)}): - dataset = ScalarDataset(0, 256, 1) - Batch = Axis("batch", len(devices)) - loader = ShardedBatchLoader(dataset, mesh, Batch) - - batches = list(itertools.islice(loader, 10)) - for batch in batches: - check_sharded_consistency(batch, check_disjoint_indices_are_different=True) - - -@skip_if_not_enough_devices(2) -def test_structured_batches_model_axis_2(): - devices = jax.devices() - model_axis_size = 2 - - mesh = Mesh( - np.array(devices).reshape(1, -1, model_axis_size), - (ResourceAxis.REPLICA, ResourceAxis.DATA, ResourceAxis.MODEL), - ) - with mesh, hax.axis_mapping({"batch": (ResourceAxis.REPLICA, ResourceAxis.DATA)}): - seq_len = 128 - dataset = StructuredDataset(seq_len, 0, 256, 1) - Batch = Axis("batch", len(devices)) - loader = ShardedBatchLoader(dataset, mesh, Batch) - - batches = list(itertools.islice(loader, 10)) - for batch in batches: - check_sharded_consistency(batch, check_disjoint_indices_are_different=True) - - -class StructuredDatasetWithNames(levanter.data.ShardableDataset): - def __init__(self, Height: Axis, Width: Axis, begin, end, stride): - self.Height = Height - self.Width = Width - self.begin = begin - self.end = end - self.stride = stride - - def _gen_image(self, index): - image = ( - np.arange(self.Height.size * self.Width.size, dtype=np.int32).reshape(self.Height.size, self.Width.size) - + index * 1000 - ) - - return hax.named(image, (self.Height, self.Width)) - - def __getitem__(self, item): - return { - "input_ids": self._gen_image(item), - "labels": self._gen_image(item), - "extra": { - "input_ids": self._gen_image(item), - "mask": hax.arange(self.Height) + item * 1000, - }, - "id": hax.named(jnp.array(item), ()), - } - - def __iter__(self): - for i in range(self.begin, self.end, self.stride): - yield self[i] - - def 
shard(self, shard_id: int, num_shards: int): - return StructuredDatasetWithNames( - self.Height, self.Width, self.begin + shard_id, self.end, self.stride * num_shards - ) - - -def test_structured_batches_model_axis_1_with_names(): - devices = jax.devices() - model_axis_size = 1 - - mesh = Mesh( - np.array(devices).reshape(1, -1, model_axis_size), - (ResourceAxis.REPLICA, ResourceAxis.DATA, ResourceAxis.MODEL), - ) - with mesh, hax.axis_mapping({"batch": (ResourceAxis.REPLICA, ResourceAxis.DATA)}): - Height = Axis("Height", 16) - Width = Axis("Width", 16) - dataset = StructuredDatasetWithNames(Height, Width, 0, 256, 1) - Batch = Axis("batch", len(devices)) - loader = ShardedBatchLoader(dataset, mesh, Batch) - - batches = list(itertools.islice(loader, 10)) - for batch in batches: - check_sharded_consistency(batch, check_disjoint_indices_are_different=True) - - -@skip_if_not_enough_devices(2) -def test_structured_batches_model_axis_2_with_names(): - devices = jax.devices() - model_axis_size = 2 - - mesh = Mesh( - np.array(devices).reshape(1, -1, model_axis_size), - (ResourceAxis.REPLICA, ResourceAxis.DATA, ResourceAxis.MODEL), - ) - with mesh, hax.axis_mapping({"batch": (ResourceAxis.REPLICA, ResourceAxis.DATA)}): - Height = Axis("Height", 16) - Width = Axis("Width", 16) - dataset = StructuredDatasetWithNames(Height, Width, 0, 256, 1) - Batch = Axis("batch", len(devices)) - loader = ShardedBatchLoader(dataset, mesh, Batch) - - batches = list(itertools.islice(loader, 10)) - for batch in batches: - check_sharded_consistency(batch, check_disjoint_indices_are_different=True) - - -@skip_if_not_enough_devices(4) -def test_structured_batches_model_axis_2_subsharded(): - """This tests data loading if individual datums are sharded too""" - devices = jax.devices() - model_axis_size = 2 - - mesh = Mesh( - np.array(devices).reshape(1, -1, model_axis_size), - (ResourceAxis.REPLICA, ResourceAxis.DATA, ResourceAxis.MODEL), - ) - Height = Axis("Height", 16) - Width = Axis("Width", 16) - with mesh, hax.axis_mapping({"batch": (ResourceAxis.REPLICA, ResourceAxis.DATA), Height.name: ResourceAxis.MODEL}): - dataset = StructuredDatasetWithNames(Height, Width, 0, 256, 1) - Batch = Axis("batch", len(devices)) - loader = ShardedBatchLoader(dataset, mesh, Batch) - - batches = list(itertools.islice(loader, 10)) - for batch in batches: - check_sharded_consistency(batch, check_disjoint_indices_are_different=True) - - -def test_sharded_loader_doesnt_throw_away_data(): - devices = jax.devices() - model_axis_size = 1 - - mesh = Mesh( - np.array(devices).reshape(1, -1, model_axis_size), - (ResourceAxis.REPLICA, ResourceAxis.DATA, ResourceAxis.MODEL), - ) - with mesh, hax.axis_mapping({"batch": (ResourceAxis.REPLICA, ResourceAxis.DATA)}): - dataset = ScalarDataset(0, 256, 1) - Batch = Axis("batch", len(devices)) - loader = ShardedBatchLoader(dataset, mesh, Batch) - - batches = list(itertools.islice(loader, 10)) - dataset_examples = list(itertools.islice(dataset, 10 * Batch.size)) - - def unbatch_example(example): - return example.unbind("batch") - - loader_examples = [ex for b in batches for ex in unbatch_example(b)] - - for ex_d, ex_l in zip(dataset_examples, loader_examples): - assert jnp.all(ex_d.array == ex_l.array) diff --git a/tests/test_shuffle_dataset.py b/tests/test_shuffle_dataset.py deleted file mode 100644 index 226986d14..000000000 --- a/tests/test_shuffle_dataset.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import Iterator - -from jax.random import PRNGKey - -from levanter.data import Dataset, ShuffleDataset - 
- -class RangeDataset(Dataset[int]): - def __init__(self, start: int, end: int): - self.start = start - self.end = end - - def __iter__(self) -> Iterator[int]: - yield from range(self.start, self.end) - - -def test_shuffle_dataset(): - dataset = RangeDataset(0, 100) - assert list(dataset) == list(range(100)) - - key = PRNGKey(0) - shuffle_dataset = ShuffleDataset(dataset, key, 10) - - assert set(shuffle_dataset) == set(range(100)) - - assert list(shuffle_dataset) != list(range(100)) - - key2 = PRNGKey(2) - shuffle_dataset2 = ShuffleDataset(dataset, key2, 10) - assert list(shuffle_dataset2) != list(shuffle_dataset) diff --git a/tests/test_text.py b/tests/test_text.py index a9d407b44..a2645c1f9 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -1,12 +1,14 @@ import tempfile import jax.numpy as jnp +from transformers import AutoTokenizer import haliax as hax -from levanter.data.text import LMDatasetConfig +from levanter.data.text import BatchTokenizer, LMDatasetConfig from levanter.models.lm_model import LmExample from levanter.models.loss import next_token_loss +from tests.test_utils import skip_if_hf_model_not_accessible def test_dont_blow_up_without_validation_set(): @@ -39,3 +41,29 @@ def test_lm_example_handles_ignore_id(): no_ignore_loss = next_token_loss(Pos, Vocab, distr, tokens, loss_mask=ex_no_ignore.loss_mask) assert no_ignore_loss.item() >= ignored_loss.item() + 100 / Pos.size + + +def test_merge_split_encodings(): + tokenizer = AutoTokenizer.from_pretrained("gpt2") + # make this very short for testing + + lorem = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.""" + + short_batch_tokenizer = BatchTokenizer(tokenizer, _workaround_len=len(lorem) // 3) + # force this + short_batch_tokenizer._needs_long_sequence_workaround = True + + batch_tokenizer = BatchTokenizer(tokenizer, _workaround_len=50000) + batch = [lorem] + + short_out = short_batch_tokenizer(batch) + reg_out = batch_tokenizer(batch) + + assert short_out == reg_out + + +@skip_if_hf_model_not_accessible("meta-llama/Llama-2-7b-hf") +def test_llama_tokenizer_needs_long_sequence_workaround(): + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + batch_tokenizer = BatchTokenizer(tokenizer) + assert batch_tokenizer._needs_long_sequence_workaround diff --git a/tests/test_tokenized_document_cache.py b/tests/test_tokenized_document_cache.py deleted file mode 100644 index d3b452937..000000000 --- a/tests/test_tokenized_document_cache.py +++ /dev/null @@ -1,216 +0,0 @@ -import tempfile -from typing import List, Sequence, TypeVar - -import pytest -import ray -from transformers import AutoTokenizer, BatchEncoding - -from levanter.data.shard_cache import build_or_load_cache -from levanter.data.sharded_dataset import ShardedDataset, TextUrlDataset -from levanter.data.text import TokenizedDocumentCache -from levanter.utils.py_utils import logical_cpu_core_count -from test_utils import IdentityProcessor, ShardsDataset, SingleShardDocumentSource, skip_in_ci - - -tokenizer = AutoTokenizer.from_pretrained("gpt2") - -T = TypeVar("T") - - -def setup_module(module): - ray_designated_cores = max(1, logical_cpu_core_count()) - ray.init("local", num_cpus=ray_designated_cores) - - -def teardown_module(module): - ray.shutdown() - - -@pytest.mark.ray -def test_index_empty_file(): - with tempfile.TemporaryDirectory() as tmpdir: - empty_dataset = [""] - source = SingleShardDocumentSource(empty_dataset) - cache = TokenizedDocumentCache.build_or_load( - f"{tmpdir}/cache", - source, - tokenizer, - flatten_docs=True, - enforce_bos=False, - enforce_eos=False, - override_resources={"num_cpus": 1}, - ) - - for chunk in cache: - assert chunk["input_ids"].size == 0 - - -@pytest.mark.ray -def test_index_no_files(): - with tempfile.TemporaryDirectory() as tmpdir: - empty_dataset = [] - source = SingleShardDocumentSource(empty_dataset) - cache = TokenizedDocumentCache.build_or_load( - f"{tmpdir}/cache", - source, - tokenizer, - flatten_docs=True, - enforce_eos=False, - override_resources={"num_cpus": 1}, - ) - - for chunk in cache: - pytest.fail("Should not have any chunks") - - -@skip_in_ci -@pytest.mark.ray -def test_doc_cache_reproduces_data_one_batch_per_shard(): - def doc_i(i: int): - return BatchEncoding(data=dict(input_ids=[list(range(10 * i, 10 * (i + 1)))])) - - num_docs = 10 - docs = [doc_i(j) for j in range(num_docs)] - - class OneDocPerShardSource(ShardedDataset[T]): - def __init__(self, docs: List[T]): - self.docs = docs - - @property - def shard_names(self) -> Sequence[str]: - return [str(i) for i in range(len(self.docs))] - - def open_shard_at_row(self, shard_name: str, row: int): - if row != 0: - raise ValueError(f"Expected row 0, got {row}") - - return [self.docs[int(shard_name)]] - - source = OneDocPerShardSource(docs) - - with tempfile.TemporaryDirectory() as tmpdir: - build_or_load_cache(f"{tmpdir}/cache", source, IdentityProcessor(), await_finished=True) - cache = TokenizedDocumentCache.load(f"{tmpdir}/cache", flatten_docs=False) - - result = list(cache) - - assert len(result) == 
num_docs - # sort the docs by input_ids b/c the order is not guaranteed - for i in range(len(result)): - as_listed = BatchEncoding(data={k: [vv.tolist() for vv in v] for k, v in result[i].items()}) - assert as_listed == docs[i] - - -@skip_in_ci -@pytest.mark.ray -@pytest.mark.parametrize("batch_size", list([1, 2, 3, 8])) -def test_doc_cache_reproduces_data_multi_docs_per_batch_sharded(batch_size): - def batch_docs(doc_ids): - return BatchEncoding(data=dict(input_ids=[list(range(10 * i, 10 * (i + 1))) for i in doc_ids])) - - num_docs = 10 - batches = [batch_docs([j, j + 1]) for j in range(0, num_docs, batch_size)] - - source = ShardsDataset([[b] for b in batches]) - with tempfile.TemporaryDirectory() as tmpdir: - build_or_load_cache(f"{tmpdir}/cache", source, IdentityProcessor()) - cache = TokenizedDocumentCache.load(f"{tmpdir}/cache", flatten_docs=True) - - result = list(cache) - - assert len(result) == len(batches) - - def list_in_list(a, b): - """checks if a is a contiguous sublist of b""" - n = len(a) - return any((list(a) == list(b[i : i + n])) for i in range(len(b) - n + 1)) - - # all we can really assert is that every doc from docs is in the result as a sublist - for i in range(len(batches)): - doc_tokens = batches[i]["input_ids"][0] - found = False - for j in range(len(result)): - # check if the doc is in this result doc - found = list_in_list(doc_tokens, result[j]["input_ids"][0]) - if found: - break - assert found - - -@skip_in_ci -@pytest.mark.ray -def test_doc_cache_sharding(): - def doc_i(i: int): - return BatchEncoding(data=dict(input_ids=[list(range(10 * i, 10 * (i + 1)))])) - - num_docs = 25 - num_shards = 12 - docs = [doc_i(j) for j in range(num_docs)] - # group into num_shards groups - doc_shards = [docs[i : i + num_docs // num_shards] for i in range(0, num_docs, num_docs // num_shards)] - - with tempfile.TemporaryDirectory() as tmpdir: - source = ShardsDataset(doc_shards) - build_or_load_cache(f"{tmpdir}/cache", source, IdentityProcessor()) - - # must evenly divide num_shards - num_shards_rebuild = [1, 2, 3, 4, 6, 12] - - for open_shards in num_shards_rebuild: - cache = TokenizedDocumentCache.load(f"{tmpdir}/cache", flatten_docs=False) - reconstructed = [] - - for shard_idx in range(0, open_shards): - # now we shard the cache - c = cache.shard(shard_idx, open_shards) - reconstructed.extend([d for b in c for d in _unbatch_encoding(b)]) - - assert len(reconstructed) == num_docs - - # sort the docs by input_ids b/c the order is not guaranteed - reconstructed.sort(key=lambda x: x["input_ids"][0][0]) # extra [0] for batchiness - for i in range(len(reconstructed)): - as_listed = BatchEncoding(data={k: [vv.tolist() for vv in v] for k, v in reconstructed[i].items()}) - assert as_listed == docs[i] - - -def _unbatch_encoding(enc: BatchEncoding): - docs = [] - for i in range(len(enc["input_ids"])): - docs.append(BatchEncoding(data={k: [v[i]] for k, v in enc.items()})) - return docs - - -@pytest.mark.ray -def test_cache_fails_with_different_tokenizer(): - with tempfile.TemporaryDirectory() as tmpdir: - with open(f"{tmpdir}/data.txt", "w") as f: - f.write("") - - dataset = TextUrlDataset( - [f"{tmpdir}/data.txt"], - ) - - tokenizer_a = AutoTokenizer.from_pretrained("microsoft/phi-2") - tokenizer_b = AutoTokenizer.from_pretrained("google/flan-t5-small") - - TokenizedDocumentCache.build_or_load( - tmpdir, - dataset, - tokenizer=tokenizer_a, - ) - - # Loading with the original tokenizer should be fine. 
- TokenizedDocumentCache.build_or_load( - tmpdir, - dataset, - tokenizer=tokenizer_a, - ) - - # Loading with a different tokenizer should error out. - with pytest.raises(ValueError): - TokenizedDocumentCache.build_or_load( - tmpdir, - dataset, - tokenizer=tokenizer_b, - ) diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py new file mode 100644 index 000000000..e25ef7928 --- /dev/null +++ b/tests/test_tree_store.py @@ -0,0 +1,435 @@ +import tempfile +from typing import Iterator, List, Sequence + +import numpy as np +import pytest +import tensorstore as ts + +from levanter.data import BatchProcessor, ShardedDataSource +from levanter.data.utils import batched +from levanter.store.tree_store import TreeStore + + +class SimpleProcessor(BatchProcessor[Sequence[int], dict[str, np.ndarray]]): + def __init__(self, batch_size: int = 8): + self._batch_size = batch_size + + def __call__(self, batch: Sequence[Sequence[int]]) -> Sequence[dict[str, Sequence[int]]]: + return [{"data": x} for x in batch] + + @property + def output_exemplar(self) -> dict[str, Sequence[int]]: + return {"data": np.array([0], dtype=np.int64)} + + @property + def batch_size(self) -> int: + return self._batch_size + + @property + def num_cpus(self) -> int: + return 1 + + +class SimpleShardSource(ShardedDataSource[List[int]]): + def __init__(self, num_shards: int = 4): + self._num_shards = num_shards + + @property + def shard_names(self) -> Sequence[str]: + return [f"shard_{i}" for i in range(self._num_shards)] + + def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[List[int]]: + # parse the shard name to get the shard number + shard_num = int(shard_name.split("_")[1]) + return ([shard_num * 10 + i] * 10 for i in range(row, 10)) + + +def test_tree_builder_with_processor(): + with tempfile.TemporaryDirectory() as tempdir: + exemplar = {"data": np.array([0], dtype=np.int64)} + + builder = TreeStore.open(exemplar, tempdir, mode="w") + processor = SimpleProcessor() + source = SimpleShardSource() + + for batch in batched(source, processor.batch_size): + processed = processor(batch) + builder.extend(processed) + + assert len(builder) == 40 + + for i, x in enumerate(builder): + assert len(x) == 1 + + np.testing.assert_array_equal(x["data"], np.asarray([i % 10 + i // 10 * 10] * 10)) + + assert i == 39 + + # now test random access + for i in range(40): + x = builder[i] + assert len(x) == 1 + np.testing.assert_array_equal(x["data"], np.asarray([i % 10 + i // 10 * 10] * 10)) + + # double check columnar access + assert builder.tree["data"].data_size == 10 * 40 + + +def test_append_batch(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir) + + batch1 = [ + {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}, + {"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])}, + ] + builder.extend(batch1) + + assert len(builder) == 2 + + result1 = builder[0] + assert np.all(result1["a"] == np.array([1.0, 2.0])) + assert np.all(result1["b"] == np.array([3.0, 4.0])) + + result2 = builder[1] + assert np.all(result2["a"] == np.array([5.0, 6.0])) + assert np.all(result2["b"] == np.array([7.0, 8.0])) + + +def test_append_batch_different_shapes(): + with tempfile.TemporaryDirectory() as tmpdir: + + def _f32(x): + return np.asarray(x, dtype=np.float32) + + exemplar = {"a": _f32([0]), "b": _f32([0])} + builder = TreeStore.open(exemplar, tmpdir) + batch1 = [ + {"a": _f32([1.0, 2.0]), "b": _f32([3.0, 
4.0])}, + {"a": _f32([5.0, 6.0]), "b": _f32([7.0, 8.0])}, + ] + builder.extend(batch1) + + batch2 = [ + {"a": _f32([9.0]), "b": _f32([10.0])}, + {"a": _f32([11.0, 12.0, 13.0]), "b": _f32([14.0, 15.0, 16.0])}, + ] + builder.extend(batch2) + + assert len(builder) == 4 + + result3 = builder[2] + assert np.all(result3["a"] == np.array([9.0])) + assert np.all(result3["b"] == np.array([10.0])) + + result4 = builder[3] + assert np.all(result4["a"] == np.array([11.0, 12.0, 13.0])) + assert np.all(result4["b"] == np.array([14.0, 15.0, 16.0])) + + +def test_extend_batch_different_shapes(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir) + + batch1 = {"a": [np.array([1.0, 2.0]), np.array([5.0, 6.0])], "b": [np.array([3.0, 4.0]), np.array([7.0, 8.0])]} + builder.extend_with_batch(batch1) + + batch2 = { + "a": [np.array([9.0]), np.array([11.0, 12.0, 13.0])], + "b": [np.array([10.0]), np.array([14.0, 15.0, 16.0])], + } + builder.extend_with_batch(batch2) + + assert len(builder) == 4 + + result3 = builder[2] + assert np.all(result3["a"] == np.array([9.0])) + assert np.all(result3["b"] == np.array([10.0])) + + result4 = builder[3] + assert np.all(result4["a"] == np.array([11.0, 12.0, 13.0])) + assert np.all(result4["b"] == np.array([14.0, 15.0, 16.0])) + + +def test_len(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir) + + assert len(builder) == 0 + + batch = [ + {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}, + {"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])}, + ] + builder.extend(batch) + + assert len(builder) == 2 + + +def test_getitem(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir) + + batch = [ + {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}, + {"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])}, + ] + builder.extend(batch) + + result = builder[0] + assert np.all(result["a"] == np.array([1.0, 2.0])) + assert np.all(result["b"] == np.array([3.0, 4.0])) + + result = builder[1] + assert np.all(result["a"] == np.array([5.0, 6.0])) + assert np.all(result["b"] == np.array([7.0, 8.0])) + + # test slice + # result = builder[0:2] + # assert isinstance(result["a"], JaggedArray) + # assert isinstance(result["b"], JaggedArray) + + +def test_getitem_out_of_bounds(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir) + + batch = [ + {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}, + {"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])}, + ] + builder.extend(batch) + + with pytest.raises(IndexError): + builder[2] + + +def test_iter(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir) + + batch = [ + {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}, + {"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])}, + ] + builder.extend(batch) + + for i, result in enumerate(builder): + if i == 0: + assert np.all(result["a"] == np.array([1.0, 2.0])) + assert np.all(result["b"] == np.array([3.0, 4.0])) + elif i == 1: + assert 
np.all(result["a"] == np.array([5.0, 6.0])) + assert np.all(result["b"] == np.array([7.0, 8.0])) + else: + pytest.fail("Unexpected index") + + +def test_reading_from_written(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir, mode="w") + + batch = [ + {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}, + {"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])}, + ] + builder.extend(batch) + + del builder + + builder2 = TreeStore.open(exemplar, tmpdir, mode="r") + + for i, result in enumerate(builder2): + if i == 0: + assert np.all(result["a"] == np.array([1.0, 2.0])) + assert np.all(result["b"] == np.array([3.0, 4.0])) + elif i == 1: + assert np.all(result["a"] == np.array([5.0, 6.0])) + assert np.all(result["b"] == np.array([7.0, 8.0])) + else: + pytest.fail("Unexpected index") + + +def test_resolve_changed_cache_size(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir, mode="w") + follower = TreeStore.open(exemplar, tmpdir, mode="r") + + batch = [ + {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}, + {"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])}, + ] + builder.extend(batch) + + follower = follower.reload() + follower2 = TreeStore.open(exemplar, tmpdir, mode="r") + + assert len(follower2) == 2 + assert len(follower) == 2 + + builder.extend(batch) + follower = follower.reload() + + assert len(follower) == 4 + + +# this test mostly exists to help me remember the API + + +def test_simple_resize_bounds(): + with tempfile.TemporaryDirectory() as tmpdir: + store1 = ts.open( + { + "driver": "zarr", + "kvstore": { + "driver": "file", + "path": tmpdir, + }, + }, + create=True, + dtype=ts.int32, + shape=[1000, 2000, 3000], + chunk_layout=ts.ChunkLayout(inner_order=[2, 1, 0]), + ).result() + + store2 = ts.open( + { + "driver": "zarr", + "kvstore": { + "driver": "file", + "path": tmpdir, + }, + }, + dtype=ts.int32, + ).result() + + assert store2.shape == (1000, 2000, 3000) + assert store2.chunk_layout.inner_order == (2, 1, 0) + + store1 = store1.resize(exclusive_max=[2000, 3000, 4000]).result() + + assert store1.shape == (2000, 3000, 4000) + + # store2 = store2[ts.d[0].mark_bounds_implicit[True]].resolve().result() + spec = store2.spec(retain_context=True, minimal_spec=True) + # spec.update(transform={}) + store2 = ts.open(spec).result() + + # store2 = store2.resolve(fix_resizable_bounds=False).result() + + assert store2.shape == (2000, 3000, 4000) # nope? 
+ + +@pytest.mark.asyncio +async def test_get_batch_single_item(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir) + + batch1 = [ + {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}, + {"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])}, + ] + builder.extend(batch1) + + # Retrieve a single item using get_batch + batch = await builder.get_batch([0]) + result = batch[0] + + expected_data = builder[0] + assert np.array_equal(result["a"], expected_data["a"]) + assert np.array_equal(result["b"], expected_data["b"]) + + +@pytest.mark.asyncio +async def test_get_batch_multiple_items(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir) + + batch1 = [ + {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}, + {"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])}, + {"a": np.array([9.0, 10.0]), "b": np.array([11.0, 12.0])}, + ] + builder.extend(batch1) + + # Retrieve multiple items using get_batch + indices = [0, 2] + batch = await builder.get_batch(indices) + + for idx, result in zip(indices, batch): + expected_data = builder[idx] + assert np.array_equal(result["a"], expected_data["a"]) + assert np.array_equal(result["b"], expected_data["b"]) + + +@pytest.mark.asyncio +async def test_get_batch_out_of_order(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir) + + batch1 = [ + {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}, + {"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])}, + {"a": np.array([9.0, 10.0]), "b": np.array([11.0, 12.0])}, + ] + builder.extend(batch1) + + # Retrieve items out of order using get_batch + indices = [2, 0, 1] + batch = await builder.get_batch(indices) + + for idx, result in zip(indices, batch): + expected_data = builder[idx] + assert np.array_equal(result["a"], expected_data["a"]) + assert np.array_equal(result["b"], expected_data["b"]) + + +@pytest.mark.asyncio +async def test_get_batch_with_shapes(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([[0]], dtype=np.float64), "b": np.array([[0]], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir) + + batch1 = [ + {"a": np.array([[1.0, 2.0], [3.0, 4.0]]), "b": np.array([[5.0, 6.0], [7.0, 8.0]])}, + {"a": np.array([[9.0, 10.0], [11.0, 12.0]]), "b": np.array([[13.0, 14.0], [15.0, 16.0]])}, + ] + builder.extend(batch1) + + # Retrieve multiple items using get_batch + indices = [0, 1] + batch = await builder.get_batch(indices) + + for idx, result in zip(indices, batch): + expected_data = builder[idx] + assert np.array_equal(result["a"], expected_data["a"]) + assert np.array_equal(result["b"], expected_data["b"]) + + +@pytest.mark.asyncio +async def test_get_batch_empty(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir) + + batch1 = [ + {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}, + {"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])}, + ] + builder.extend(batch1) + + # Retrieve an empty batch + batch = await builder.get_batch([]) + + assert batch == [] diff --git a/tests/test_utils.py b/tests/test_utils.py index 
53042826c..1bf03b624 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -17,7 +17,7 @@ from levanter.checkpoint import _get_fs_and_plain_path from levanter.data._preprocessor import BatchProcessor -from levanter.data.sharded_dataset import ShardedDataset +from levanter.data.sharded_datasource import ShardedDataSource from levanter.data.text import _stack_batch_encodings from levanter.models.attention import AttentionMask @@ -193,17 +193,21 @@ def decorator(fn): return pytest.mark.skipif("CI" in os.environ, reason="skipped in CI")(fn_or_msg) -class IdentityProcessor(BatchProcessor[BatchEncoding]): +class IdentityProcessor(BatchProcessor[BatchEncoding, BatchEncoding]): def __call__(self, batch: Sequence[BatchEncoding]) -> BatchEncoding: stacked = reduce(_stack_batch_encodings, batch) return stacked + @property + def output_exemplar(self): + return BatchEncoding({}) + @property def num_cpus(self) -> int: return 0 -class ShardsDataset(ShardedDataset[T]): +class ShardsDataSource(ShardedDataSource[T]): def __init__(self, docs: List[List[T]]): self.docs = docs @@ -215,7 +219,7 @@ def open_shard_at_row(self, shard_name: str, row: int): return self.docs[int(shard_name)][row:] -class SingleShardDocumentSource(ShardedDataset[T]): +class SingleShardDocumentSource(ShardedDataSource[T]): def __init__(self, docs: List[T]): self.docs = docs diff --git a/tests/tiny_test_corpus.py b/tests/tiny_test_corpus.py index 5cd0e8a70..91597c137 100644 --- a/tests/tiny_test_corpus.py +++ b/tests/tiny_test_corpus.py @@ -2,10 +2,11 @@ import os import numpy +import numpy as np from levanter.data.audio import AudioIODatasetConfig -from levanter.data.shard_cache import ShardCache from levanter.data.text import LMDatasetConfig +from levanter.store.cache import TreeCache def _write_tiny_corpus(path): @@ -43,17 +44,24 @@ def tiny_asr_corpus_config(path): def construct_small_data_cache( path, num_shards=8, chunk_size=512, doc_len=128, vocab_size=1024 -) -> tuple[LMDatasetConfig, dict[str, ShardCache]]: - from levanter.data.shard_cache import SerialCacheWriter +) -> tuple[LMDatasetConfig, dict[str, TreeCache]]: + from levanter.store.cache import SerialCacheWriter rng = numpy.random.default_rng(0) - caches = {} + caches: dict[str, TreeCache] = {} + + exemplar = {"input_ids": numpy.zeros((doc_len,), dtype=numpy.int32)} for split in ["train", "validation"]: - with SerialCacheWriter(f"{path}/cache/{split}", chunk_size) as writer: + with SerialCacheWriter(f"{path}/cache/{split}", exemplar) as writer: for shard in range(num_shards): - writer.write_batch({"input_ids": rng.integers(0, vocab_size, size=(chunk_size, doc_len))}) + writer.write_batch( + [ + {"input_ids": rng.integers(0, vocab_size, size=(doc_len,), dtype=np.int32)} + for _ in range(chunk_size) + ] + ) caches[split] = writer.result() config = LMDatasetConfig( From 944a19f061f01736c0fc5742f8a9b8db3161efe7 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 5 Sep 2024 13:16:35 -0700 Subject: [PATCH 45/94] unpin ray (#718) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8712d16a3..e1bdedf3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,7 @@ dependencies = [ "matplotlib>=3.7.0", "tblib>=1.7.0,<4.0.0", "dataclasses-json~=0.6.4", - "ray[default]==2.35.0", + "ray[default]>=2.34.0", "pydantic<3", "rich~=13.0", "filelock~=3.13", From f13cfde11794385e3002df0990c7d0bd27d9fad1 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 5 Sep 2024 14:42:16 -0700 Subject: [PATCH 46/94] bump 
equinox --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7ba0b4c32..1679e8607 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ ] dependencies = [ "haliax>=1.4.dev307", - "equinox>=0.11.4", + "equinox>=0.11.5", "jaxtyping>=0.2.20", "tokenizers>=0.15.2", "transformers>=4.41.2", From 8d3dfe01564841eaff2e3f73c86fe23f9e41f945 Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 6 Sep 2024 12:12:09 -0700 Subject: [PATCH 47/94] wip --- config/llama_22b_with_dclm.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/llama_22b_with_dclm.yaml b/config/llama_22b_with_dclm.yaml index 7dec40026..8e01c5d84 100644 --- a/config/llama_22b_with_dclm.yaml +++ b/config/llama_22b_with_dclm.yaml @@ -1,7 +1,7 @@ data: !include data/dclm_gpt_neo.yaml model: # 22B class model type: llama - seq_len: 2048 + seq_len: 4096 hidden_dim: 6144 intermediate_dim: 16384 num_layers: 56 @@ -16,7 +16,7 @@ trainer: tags: ["dclm", "22B", "llama"] mp: p=f32,c=bfloat16 - train_batch_size: 2048 + train_batch_size: 512 num_train_steps: 100000 steps_per_eval: 1000 tensor_parallel_axes: ["mlp", "heads"] From 8ecb7ea40c4753940333f1f04a5be1a7f3418cf3 Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 6 Sep 2024 15:56:31 -0700 Subject: [PATCH 48/94] 768 --- config/llama_22b_with_dclm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/llama_22b_with_dclm.yaml b/config/llama_22b_with_dclm.yaml index 8e01c5d84..2f593021f 100644 --- a/config/llama_22b_with_dclm.yaml +++ b/config/llama_22b_with_dclm.yaml @@ -16,7 +16,7 @@ trainer: tags: ["dclm", "22B", "llama"] mp: p=f32,c=bfloat16 - train_batch_size: 512 + train_batch_size: 768 num_train_steps: 100000 steps_per_eval: 1000 tensor_parallel_axes: ["mlp", "heads"] From 9ba6b2012da3c021f69239ff7fd8ca3078b8781d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 10:09:59 -0700 Subject: [PATCH 49/94] Update gcsfs requirement from <2024.7,>=2024.2 to >=2024.2,<2024.10 (#723) Updates the requirements on [gcsfs](https://github.com/fsspec/gcsfs) to permit the latest version. - [Commits](https://github.com/fsspec/gcsfs/compare/2024.2.0...2024.9.0post1) --- updated-dependencies: - dependency-name: gcsfs dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e1bdedf3a..d04d858a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "pyarrow>=11.0.0", "zstandard>=0.20.0", "datasets~=2.18", - "gcsfs>=2024.2,<2024.7", + "gcsfs>=2024.2,<2024.10", "braceexpand>=0.1.7", "jmp>=0.0.3", "fsspec[http]>=2024.2,<2024.7", From 78da9028315e9d4e37223a85a9fa1bce8d8e626e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 10:10:12 -0700 Subject: [PATCH 50/94] Update fsspec[http] requirement (#722) Updates the requirements on [fsspec[http]](https://github.com/fsspec/filesystem_spec) to permit the latest version. - [Commits](https://github.com/fsspec/filesystem_spec/compare/2024.2.0...2024.9.0) --- updated-dependencies: - dependency-name: fsspec[http] dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d04d858a2..b5ff670ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "gcsfs>=2024.2,<2024.10", "braceexpand>=0.1.7", "jmp>=0.0.3", - "fsspec[http]>=2024.2,<2024.7", + "fsspec[http]>=2024.2,<2024.10", "tensorstore>=0.1.62", "pytimeparse>=1.1.8", "humanfriendly==10.0", From 5b685c364e8502d186e4a5877b37eb1c41848c80 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 10:10:38 -0700 Subject: [PATCH 51/94] Bump equinox from 0.11.4 to 0.11.5 (#721) Bumps [equinox](https://github.com/patrick-kidger/equinox) from 0.11.4 to 0.11.5. - [Release notes](https://github.com/patrick-kidger/equinox/releases) - [Commits](https://github.com/patrick-kidger/equinox/compare/v0.11.4...v0.11.5) --- updated-dependencies: - dependency-name: equinox dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b5ff670ba..e390462da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ ] dependencies = [ "haliax>=1.4.dev307", - "equinox==0.11.4", + "equinox==0.11.5", "jaxtyping>=0.2.20", "tokenizers>=0.15.2", "transformers>=4.41.2", From a91ef813f943e7d80fe53998d2ea311ce0a17bae Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 10 Sep 2024 18:30:50 -0700 Subject: [PATCH 52/94] fix extra context docker build bug (#724) --- docker/tpu/Dockerfile.incremental | 2 +- src/levanter/infra/docker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental index f0369736c..64c14b4c9 100644 --- a/docker/tpu/Dockerfile.incremental +++ b/docker/tpu/Dockerfile.incremental @@ -22,4 +22,4 @@ ADD . /opt/levanter # Add $EXTRA_CTX to the same location as in local machine. # it's already in the image, so we don't need to copy it. 
just move it if we set EXTRA_CTX -RUN if [ -f ".mnt" ]; then mkdir -p $(dirname $EXTRA_CTX) && mv .mnt $EXTRA_CTX; fi +RUN if [ -f ".mnt" ] || [ -d ".mnt" ]; then mkdir -p $(dirname $EXTRA_CTX) && mv .mnt $EXTRA_CTX; fi diff --git a/src/levanter/infra/docker.py b/src/levanter/infra/docker.py index 2f8052f87..63a51ae2f 100644 --- a/src/levanter/infra/docker.py +++ b/src/levanter/infra/docker.py @@ -159,7 +159,7 @@ def copy_extra_ctx(extra_ctx): mount_dst = Path(".mnt") _cp(extra_ctx, mount_dst) try: - yield mount_dst + yield extra_ctx finally: _rm(mount_dst) else: From 5c185573fce7c16769add7758f95bbfc5fbeb84b Mon Sep 17 00:00:00 2001 From: David Hall Date: Wed, 11 Sep 2024 20:46:17 -0700 Subject: [PATCH 53/94] Fix eqx (#725) * don't hit ray if we don't need to in TreeCache * make ray exit quieter * reduce log spam of wandb * .aider ignore * sigh * make BackgroundIterable work not in the background * fix regression caused by new Equinox --- .gitignore | 1 + pyproject.toml | 4 +- src/levanter/distributed.py | 2 +- src/levanter/store/cache.py | 8 +++- src/levanter/tracker/wandb.py | 10 +++-- src/levanter/utils/background_iterable.py | 48 +++++++++++++++------ src/levanter/utils/thread_utils.py | 29 +++++++++++++ tests/test_background_iterable.py | 51 ++++++++++++++--------- 8 files changed, 114 insertions(+), 39 deletions(-) diff --git a/.gitignore b/.gitignore index 835da2048..8a6acca53 100644 --- a/.gitignore +++ b/.gitignore @@ -152,3 +152,4 @@ ledger.json # local execution commands local_*.sh +.aider* diff --git a/pyproject.toml b/pyproject.toml index e390462da..de85d287e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ ] dependencies = [ "haliax>=1.4.dev307", - "equinox==0.11.5", + "equinox==0.11.3", "jaxtyping>=0.2.20", "tokenizers>=0.15.2", "transformers>=4.41.2", @@ -37,7 +37,7 @@ dependencies = [ "braceexpand>=0.1.7", "jmp>=0.0.3", "fsspec[http]>=2024.2,<2024.10", - "tensorstore>=0.1.62", + "tensorstore==0.1.64", "pytimeparse>=1.1.8", "humanfriendly==10.0", "safetensors[numpy]~=0.4.2", diff --git a/src/levanter/distributed.py b/src/levanter/distributed.py index ea0bbb3c7..112409743 100644 --- a/src/levanter/distributed.py +++ b/src/levanter/distributed.py @@ -252,7 +252,7 @@ def _munge_address_port(address: str): logger.info(f"Successfully started ray head on port {ray_port}.") # install an atexit handler to kill the head when we exit - atexit.register(lambda: os.system("ray stop -g 10 --force")) + atexit.register(lambda: os.system("ray stop -g 10 --force &> /dev/null")) elif start_workers: logger.info( f"Starting ray worker and connecting to {address}. We are process {jax.process_index()}." diff --git a/src/levanter/store/cache.py b/src/levanter/store/cache.py index 85b612f91..6db7693fe 100644 --- a/src/levanter/store/cache.py +++ b/src/levanter/store/cache.py @@ -54,7 +54,8 @@ LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" # TODO: should probably do this in terms of bytes -MIN_ITEMS_TO_WRITE = 8192 +# this is kinda silly, but the bigger the better. 
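+# (Assumption, inferred from the names below rather than stated in the patch: the cache writer buffers
+#  processed rows and flushes once MIN_ITEMS_TO_WRITE rows have accumulated or MAX_TIME_BETWEEN_WRITES
+#  seconds have passed, so raising the threshold trades memory for fewer, larger writes to the store.)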
+MIN_ITEMS_TO_WRITE = 32 * 1024 MAX_TIME_BETWEEN_WRITES = 100.0 @@ -883,6 +884,7 @@ def __init__( self.logger = pylogging.getLogger(f"TreeCache.{name}") self._store_future: threading_Future[TreeStore] = threading_Future() self._stop = False + # assert _broker is None if self._broker is not None: self._monitor_thread = threading.Thread(target=self._monitor_metrics, daemon=True) @@ -1078,11 +1080,15 @@ async def _get_start_stops_async(self, slice): return start, step, stop def await_finished(self, timeout: Optional[float] = None): + if self._broker is None: + return x = ray.get(self.finished_sentinel(), timeout=timeout) self._attempt_to_load_store() return x async def finished(self): + if self._broker is None: + return x = await self.finished_sentinel() # TODO: make an async version of this self._attempt_to_load_store() diff --git a/src/levanter/tracker/wandb.py b/src/levanter/tracker/wandb.py index 1e95c0d3a..18f0251ec 100644 --- a/src/levanter/tracker/wandb.py +++ b/src/levanter/tracker/wandb.py @@ -45,6 +45,8 @@ def __init__(self, run: Optional[WandbRun]): else: self.run = run + self._last_warning_step = -500 + def log_hyperparameters(self, hparams: dict[str, Any]): self.run.config.update(hparams, allow_val_change=True) @@ -53,9 +55,11 @@ def log(self, metrics: dict[str, Any], *, step, commit=None): step = self.run.step if step < self.run.step: - logger.warning( - f"Step {step} is less than the current step {self.run.step}. Cowardly refusing to log metrics." - ) + if step - self._last_warning_step > 500: + logger.warning( + f"Step {step} is less than the current step {self.run.step}. Cowardly refusing to log metrics." + ) + self._last_warning_step = step return step = int(step) diff --git a/src/levanter/utils/background_iterable.py b/src/levanter/utils/background_iterable.py index 84c5a7789..4318b3f9b 100644 --- a/src/levanter/utils/background_iterable.py +++ b/src/levanter/utils/background_iterable.py @@ -6,6 +6,8 @@ import tblib +from levanter.utils.thread_utils import AsyncIteratorWrapper + Ex = TypeVar("Ex", covariant=True) @@ -36,30 +38,52 @@ def __init__(self, producer_fn: Callable[[], Union[Iterator[Ex], AsyncIterator[E self.max_capacity = max_capacity self._producer_fn = producer_fn self._stop_event = threading.Event() - self.q: queue.Queue = queue.Queue(self.max_capacity or 0) - self.thread = threading.Thread(target=self._fill_queue_with_batches) - self.thread.daemon = True - self.thread.start() + + if self.max_capacity is None or self.max_capacity >= 0: + self.q: queue.Queue = queue.Queue(self.max_capacity or 0) + self.thread: Optional[threading.Thread] = threading.Thread(target=self._fill_queue_with_batches) + self.thread.daemon = True + self.thread.start() + else: + # No background thread; consume items on demand + self.thread = None + self.iterator = self._producer_fn() + if not isinstance(self.iterator, Iterator): + self.iterator = AsyncIteratorWrapper(self.iterator) def __iter__(self): return self def __next__(self): - while not self._stop_event.is_set(): - batch = self.q.get() - if batch is _SENTINEL: + if self._stop_event.is_set(): + raise StopIteration + if self.thread is not None: + while not self._stop_event.is_set(): + batch = self.q.get() + if batch is _SENTINEL: + raise StopIteration + elif isinstance(batch, _ExceptionWrapper): + batch.reraise() + return batch + else: + # Consume the iterator directly on demand + try: + return next(self.iterator) + except StopIteration: + raise + except StopAsyncIteration: raise StopIteration - elif isinstance(batch, 
_ExceptionWrapper): - batch.reraise() - return batch - + except Exception as e: + raise e raise StopIteration def __del__(self): self.stop() - def stop(self): + def stop(self, wait: bool = True): self._stop_event.set() + if self.thread is not None and wait: + self.thread.join() def _fill_queue_with_batches(self): try: diff --git a/src/levanter/utils/thread_utils.py b/src/levanter/utils/thread_utils.py index 9c6e2ef36..0b4abcdaf 100644 --- a/src/levanter/utils/thread_utils.py +++ b/src/levanter/utils/thread_utils.py @@ -1,5 +1,7 @@ import asyncio +import threading from concurrent.futures import ThreadPoolExecutor +from typing import Iterator # Create a ThreadPoolExecutor @@ -26,3 +28,30 @@ def future_from_value(value): future = asyncio.Future() future.set_result(value) return future + + +class AsyncIteratorWrapper(Iterator): + def __init__(self, async_iter): + self.async_iter = async_iter + self.loop = asyncio.new_event_loop() + self.executor = ThreadPoolExecutor(max_workers=1) + self.thread = threading.Thread(target=self._run_loop, daemon=True) + self.thread.start() + + def _run_loop(self): + asyncio.set_event_loop(self.loop) + self.loop.run_forever() + + def _run_async_task(self, coro): + return asyncio.run_coroutine_threadsafe(coro, self.loop).result() + + def __iter__(self): + return self + + def __next__(self): + try: + return self._run_async_task(self.async_iter.__anext__()) + except StopAsyncIteration: + self.loop.call_soon_threadsafe(self.loop.stop) + self.thread.join() + raise StopIteration diff --git a/tests/test_background_iterable.py b/tests/test_background_iterable.py index 0da8d6ea6..603b01743 100644 --- a/tests/test_background_iterable.py +++ b/tests/test_background_iterable.py @@ -5,9 +5,10 @@ from levanter.utils.background_iterable import BackgroundIterable -def test_reentrancy(): +@pytest.mark.parametrize("max_capacity", [-1, None, 10]) +def test_reentrancy(max_capacity): test_data = list(range(1, 101)) - background_iterable = BackgroundIterable(lambda: iter(test_data), max_capacity=10) + background_iterable = BackgroundIterable(lambda: iter(test_data), max_capacity=max_capacity) iter1 = iter(background_iterable) iter2 = iter(background_iterable) @@ -19,9 +20,10 @@ def test_reentrancy(): assert data1 == test_data -def test_empty_iteration(): +@pytest.mark.parametrize("max_capacity", [-1, None, 10]) +def test_empty_iteration(max_capacity): # Create a BackgroundIterable instance with an empty producer function - background_iterable = BackgroundIterable(lambda: iter([]), max_capacity=10) + background_iterable = BackgroundIterable(lambda: iter([]), max_capacity=max_capacity) # Convert the iterator to a list for comparison data = list(background_iterable) @@ -30,13 +32,14 @@ def test_empty_iteration(): assert data == [] -def test_exception_handling(): +@pytest.mark.parametrize("max_capacity", [-1, None, 10]) +def test_exception_handling(max_capacity): # Create a producer function that raises an exception def producer_with_exception(): raise ValueError("Something went wrong!") # Create a BackgroundIterable instance with the producer function that raises an exception - background_iterable = BackgroundIterable(producer_with_exception, max_capacity=10) + background_iterable = BackgroundIterable(producer_with_exception, max_capacity=max_capacity) # Iterate over the BackgroundIterable and handle the raised exception with pytest.raises(ValueError): @@ -44,13 +47,14 @@ def producer_with_exception(): pass -def test_stop_event(): +@pytest.mark.parametrize("max_capacity", [-1, None, 10]) 
+def test_stop_event(max_capacity): def ongoing_process(): while True: for item in range(1, 101): yield item - background_iterable = BackgroundIterable(ongoing_process, max_capacity=10) + background_iterable = BackgroundIterable(ongoing_process, max_capacity=max_capacity) iter1 = iter(background_iterable) @@ -67,13 +71,15 @@ def ongoing_process(): @pytest.mark.asyncio -async def test_async_reentrancy(): +@pytest.mark.parametrize("max_capacity", [-1, None, 10]) +async def test_async_reentrancy(max_capacity): async def async_producer(): for i in range(1, 101): yield i - await asyncio.sleep(0.01) + if i % 10 == 0: + await asyncio.sleep(0.001) - background_iterable = BackgroundIterable(async_producer, max_capacity=10) + background_iterable = BackgroundIterable(async_producer, max_capacity=max_capacity) iter1 = iter(background_iterable) iter2 = iter(background_iterable) @@ -86,12 +92,13 @@ async def async_producer(): @pytest.mark.asyncio -async def test_async_empty_iteration(): +@pytest.mark.parametrize("max_capacity", [-1, None, 10]) +async def test_async_empty_iteration(max_capacity): async def async_producer(): if False: yield - background_iterable = BackgroundIterable(async_producer, max_capacity=10) + background_iterable = BackgroundIterable(async_producer, max_capacity=max_capacity) data = list(background_iterable) @@ -99,12 +106,13 @@ async def async_producer(): @pytest.mark.asyncio -async def test_async_exception_handling(): +@pytest.mark.parametrize("max_capacity", [-1, None, 10]) +async def test_async_exception_handling(max_capacity): async def async_producer_with_exception(): raise ValueError("Something went wrong!") yield 0 # have to make sure it's an async coroutine - background_iterable = BackgroundIterable(async_producer_with_exception, max_capacity=10) + background_iterable = BackgroundIterable(async_producer_with_exception, max_capacity=max_capacity) with pytest.raises(ValueError): for _ in background_iterable: @@ -112,21 +120,24 @@ async def async_producer_with_exception(): @pytest.mark.asyncio -async def test_async_stop_event(): +@pytest.mark.parametrize("max_capacity", [-1, None, 10]) +async def test_async_stop_event(max_capacity): async def ongoing_async_process(): while True: for item in range(1, 101): yield item - background_iterable = BackgroundIterable(ongoing_async_process, max_capacity=10) + background_iterable = BackgroundIterable(ongoing_async_process, max_capacity=max_capacity) iter1 = iter(background_iterable) for _ in range(5): - next(iter1) + q = next(iter1) + print(q) iter1.stop() + # this doesn't work b/c pytest is stupid with pytest.raises(StopIteration): - await next(iter1) - await next(iter1) + next(iter1) + next(iter1) From b6f334e9730af969cc0b42d42d4634a71bcaa345 Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 13 Sep 2024 00:04:24 -0700 Subject: [PATCH 54/94] get rid of eraconfig b/c draccus can't handle it --- config/llama2_small_fast_mix.yaml | 3 +-- src/levanter/data/permutation.py | 17 +++++++++-------- src/levanter/data/text.py | 17 +++++++++-------- src/levanter/store/stress_test_new_cache.py | 15 +++++++-------- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/config/llama2_small_fast_mix.yaml b/config/llama2_small_fast_mix.yaml index aabd17fae..29b0a8a52 100644 --- a/config/llama2_small_fast_mix.yaml +++ b/config/llama2_small_fast_mix.yaml @@ -1,8 +1,7 @@ data: tokenizer: "meta-llama/Llama-2-7b-hf" cache_dir: "gs://levanter-data/new-tokenized/pile_mix/" - shuffle: - era_length: 10000 + shuffle: 10000 configs: arxiv: train_urls: 
diff --git a/src/levanter/data/permutation.py b/src/levanter/data/permutation.py index a0f0566f4..6599d4974 100644 --- a/src/levanter/data/permutation.py +++ b/src/levanter/data/permutation.py @@ -1,4 +1,3 @@ -import dataclasses from typing import Optional, Sequence import jax.random @@ -29,7 +28,11 @@ def is_finite(self) -> bool: return self.dataset.is_finite() async def current_len(self) -> Optional[int]: - return await self.dataset.current_len() + if await self.final_length_is_known(): + return await self.async_len() + # In general, we can't know the current length until we know the entire length + return None + # return await self.dataset.current_len() async def getitem_async(self, index: int) -> T_co: permutation = await self._get_permutation() @@ -41,9 +44,12 @@ async def get_batch(self, indices: Sequence[int]) -> Sequence[T_co]: async def _get_permutation(self): if self._permutation is None: - self._permutation = Permutation(await self.dataset.async_len(), self.key) + self._permutation = Permutation(await self.async_len(), self.key) return self._permutation + async def wait_until_len_at_least(self, length: int) -> int: + return await self.async_len() + class EraShufflingDataset(AsyncDataset[T_co]): """ @@ -128,8 +134,3 @@ async def wait_until_len_at_least(self, length: int) -> int: # wait until we hit the next era next_era_end = (length // self.era_length + 1) * self.era_length return await self.dataset.wait_until_len_at_least(next_era_end) - - -@dataclasses.dataclass -class EraConfig: - era_length: int diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index fc9ce8052..ad03f6d01 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -29,7 +29,6 @@ from levanter.data import AsyncDataset from levanter.data.dataset import MappedAsyncDataset from levanter.data.mixture import MixtureDataset, StopStrategy -from levanter.data.permutation import EraConfig # intercept the logging nonsense here from levanter.logging import silence_transformer_nag # noqa @@ -103,17 +102,19 @@ async def current_len(self) -> Optional[int]: async def get_batch(self, indices: Sequence[int]) -> Sequence[T_co]: token_arrays = await self._await_token_cache() # logger.info(f"Time to get token cache: {time.time() - time_in}") + print(f"waiting until len is at least {max(indices) + 1}") len = await self.wait_until_len_at_least(max(indices) + 1) if len is not None and len < max(indices) + 1: raise ValueError("Requested indices beyond the end of the dataset") offsets = np.array(indices) * self.seq_len + print(f"getting offsets {offsets}") with ts.Batch(): out = [] for offset in offsets: out.append(token_arrays.data[offset : offset + self.seq_len].read()) out = await asyncio.gather(*out) - + print("done waiting") return out def get_batch_sync(self, indices: Sequence[int]) -> Sequence[T_co]: @@ -549,9 +550,9 @@ class LMTaskConfig(abc.ABC): enforce_eos: bool = True # whether to append eos even if the tokenizer doesn't ignore_token_id: Optional[int] = None - shuffle: bool | EraConfig = False + shuffle: bool | int = False """whether to shuffle the dataset. True means shuffle the whole dataset, False means don't shuffle. 
- If you want to shuffle in eras, provide an EraConfig (which asks for an era_length)""" + If you want to shuffle in eras, set this to the era length""" @cached_property def the_tokenizer(self) -> PreTrainedTokenizerBase: @@ -599,8 +600,8 @@ def train_set( if self.shuffle is True: ds = ds.shuffle(key) - elif isinstance(self.shuffle, EraConfig): - ds = ds.era_shuffle(self.shuffle.era_length, key=key) + elif isinstance(self.shuffle, int): + ds = ds.era_shuffle(self.shuffle, key=key) return ds # type: ignore @@ -754,8 +755,8 @@ def train_set( def shuffle_ds(ds, key): if self.shuffle is True: ds = ds.shuffle(key) - elif isinstance(self.shuffle, EraConfig): - ds = ds.era_shuffle(self.shuffle.era_length, key=key) + elif isinstance(self.shuffle, int): + ds = ds.era_shuffle(self.shuffle, key=key) return ds diff --git a/src/levanter/store/stress_test_new_cache.py b/src/levanter/store/stress_test_new_cache.py index c583ede56..66d002abd 100644 --- a/src/levanter/store/stress_test_new_cache.py +++ b/src/levanter/store/stress_test_new_cache.py @@ -109,14 +109,13 @@ def ensure_cache(new_cache_path): if __name__ == "__main__": import sys - if not len(sys.argv) == 3: - print("Usage: convert_to_new_cache.py old_cache_path new_cache_path") + if not len(sys.argv) == 2: + print("Usage: convert_to_new_cache.py new_cache_path") sys.exit(1) for split in ["validation", "train"]: print(f"Split: {split}", flush=True) - in_path = os.path.join(sys.argv[1], split) - out_path = os.path.join(sys.argv[2], split) + cache_path = os.path.join(sys.argv[1], split) # convert_to_new_cache(in_path, out_path) # with capture_time() as time_fn: # bench_old_cache(in_path) @@ -126,24 +125,24 @@ def ensure_cache(new_cache_path): exemplar = {"input_ids": np.zeros((SEQ_LEN,), dtype=np.int32)} with capture_time() as time_fn: - bench_new_cache_serial(exemplar, out_path) + bench_new_cache_serial(exemplar, cache_path) tokens_per_second = SEQ_LEN * BS * BATCHES / time_fn() print(f"New Cache Serial: {time_fn()} ({tokens_per_second} tps)", flush=True) with capture_time() as time_fn: - asyncio.run(bench_new_cache_serial_tokenseq(exemplar, out_path)) + asyncio.run(bench_new_cache_serial_tokenseq(exemplar, cache_path)) tokens_per_second = SEQ_LEN * BS * BATCHES / time_fn() print(f"New Cache Serial TokenSeq: {time_fn()} ({tokens_per_second} tps)", flush=True) with capture_time() as time_fn: - bench_new_cache_random(exemplar, out_path) + bench_new_cache_random(exemplar, cache_path) tokens_per_second = SEQ_LEN * BS * BATCHES / time_fn() print(f"New Cache Random: {time_fn()} ({tokens_per_second} tps)", flush=True) with capture_time() as time_fn: - asyncio.run(bench_new_cache_permutation_random(exemplar, out_path)) + asyncio.run(bench_new_cache_permutation_random(exemplar, cache_path)) tokens_per_second = SEQ_LEN * BS * BATCHES / time_fn() print(f"New Cache Permutation: {time_fn()} ({tokens_per_second} tps)", flush=True) From e33a90552df4532b1bc0c2de39d1aa8c829a8331 Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 13 Sep 2024 00:48:59 -0700 Subject: [PATCH 55/94] ugh --- src/levanter/data/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index ad03f6d01..7e25c88ef 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -600,7 +600,7 @@ def train_set( if self.shuffle is True: ds = ds.shuffle(key) - elif isinstance(self.shuffle, int): + elif isinstance(self.shuffle, int) and self.shuffle > 0: ds = ds.era_shuffle(self.shuffle, key=key) return ds # type: 
ignore From 2645efbe904dd70b2546441c477244e083788c7c Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 13 Sep 2024 00:59:51 -0700 Subject: [PATCH 56/94] missed some prints --- src/levanter/data/text.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index 7e25c88ef..20a11d090 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -102,19 +102,16 @@ async def current_len(self) -> Optional[int]: async def get_batch(self, indices: Sequence[int]) -> Sequence[T_co]: token_arrays = await self._await_token_cache() # logger.info(f"Time to get token cache: {time.time() - time_in}") - print(f"waiting until len is at least {max(indices) + 1}") len = await self.wait_until_len_at_least(max(indices) + 1) if len is not None and len < max(indices) + 1: raise ValueError("Requested indices beyond the end of the dataset") offsets = np.array(indices) * self.seq_len - print(f"getting offsets {offsets}") with ts.Batch(): out = [] for offset in offsets: out.append(token_arrays.data[offset : offset + self.seq_len].read()) out = await asyncio.gather(*out) - print("done waiting") return out def get_batch_sync(self, indices: Sequence[int]) -> Sequence[T_co]: From 5fc4084c4d0cfe040f88102e6240c46096698644 Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 13 Sep 2024 16:19:26 -0700 Subject: [PATCH 57/94] attempt at launching small fast in CI, add tqdm_loggable (#719) --- .github/workflows/docker-base-image.yaml | 9 ++- .github/workflows/launch_small_fast.yaml | 72 ++++++++++++++++++++++++ infra/launch.py | 45 ++++++++++----- pyproject.toml | 1 + src/levanter/callbacks.py | 19 ++++++- src/levanter/eval.py | 4 +- src/levanter/infra/docker.py | 7 ++- src/levanter/infra/tpus.py | 34 ++++++++--- 8 files changed, 163 insertions(+), 28 deletions(-) create mode 100644 .github/workflows/launch_small_fast.yaml diff --git a/.github/workflows/docker-base-image.yaml b/.github/workflows/docker-base-image.yaml index a5ada69c3..a5e6c3724 100644 --- a/.github/workflows/docker-base-image.yaml +++ b/.github/workflows/docker-base-image.yaml @@ -1,9 +1,12 @@ name: Build and Push Docker TPU Images on: - push: - branches: - - main + workflow_run: + workflows: ["Run Tests"] + types: + - completed + branches: [main] + workflow_dispatch: jobs: build: diff --git a/.github/workflows/launch_small_fast.yaml b/.github/workflows/launch_small_fast.yaml new file mode 100644 index 000000000..15f423674 --- /dev/null +++ b/.github/workflows/launch_small_fast.yaml @@ -0,0 +1,72 @@ +name: Launch Llama 2 Small Fast + +on: + workflow_run: + workflows: ["Build and Push Docker TPU Images"] + types: + - completed + branches: [main, "experiment/*"] +# pull_request: + workflow_dispatch: + +jobs: + test: + if: (github.event.pull_request.head.repo.full_name == github.repository) + runs-on: ubuntu-latest + env: + TPU_ZONE: "us-central2-b" + TPU_TYPE: "v4-32" + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Google Cloud SDK + uses: google-github-actions/setup-gcloud@v1 + with: + project_id: ${{ secrets.GCP_PROJECT_ID }} + + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.GCP_SA_KEY }} + + - name: Configure Google Cloud + run: | + gcloud config set project ${{ secrets.GCP_PROJECT_ID }} + REGION=${TPU_ZONE%-*} + echo "$REGION" + gcloud auth configure-docker $REGION-docker.pkg.dev + + - name: Install locally + run: | + python -m pip install --upgrade pip + pip install -e .[test] 
"jax[cpu]==0.4.30" + + - name: Launch Small Fast TPU Train LM job + run: | + export TPU_NAME=small-fast-${{ github.run_id }} + export WANDB_API_KEY=${{ secrets.WANDB_API_KEY }} + export RUN_ID=small_fast_${{ github.run_id }} + export HF_TOKEN=${{ secrets.HF_TOKEN }} + + cat > .config <=0.2" ] [project.urls] diff --git a/src/levanter/callbacks.py b/src/levanter/callbacks.py index 21aaf5faa..e03add43d 100644 --- a/src/levanter/callbacks.py +++ b/src/levanter/callbacks.py @@ -8,11 +8,13 @@ import threading import time import warnings +from datetime import timedelta from typing import Callable, Optional import humanfriendly import jax -from tqdm import tqdm +from tqdm_loggable import tqdm_logging +from tqdm_loggable.auto import tqdm import levanter.tracker from levanter.data import DataLoader @@ -39,7 +41,9 @@ def eval_loss_loop(loss_fn, model, dataset, max_batches: Optional[int] = None, n else: desc = "eval" + _tqdm_logging_one_time_setup() pbar = tqdm(dataset, desc=desc, position=1, leave=False, total=max_batches) + iter_ = iter(pbar) while True: time_in = time.time() @@ -186,6 +190,8 @@ def pbar_logger(iterable=None, desc="train", **tqdm_mkwargs): kwargs["desc"] = desc if "iterable" not in kwargs: kwargs["iterable"] = iterable + + _tqdm_logging_one_time_setup() pbar = tqdm(**kwargs) def update_pbar(step: StepInfo): @@ -359,3 +365,14 @@ def compute_and_viz_log_probs(step: StepInfo): wandb.log({"log_probs": wandb.Html(path)}, step=step.step) return compute_and_viz_log_probs + + +_did_tqdm_logging_one_time_setup = False + + +def _tqdm_logging_one_time_setup(): + global _did_tqdm_logging_one_time_setup + if _did_tqdm_logging_one_time_setup: + return + _did_tqdm_logging_one_time_setup = True + tqdm_logging.tqdm_logging.set_log_rate(timedelta(seconds=60)) diff --git a/src/levanter/eval.py b/src/levanter/eval.py index 48fcb426c..2aa9b7ff3 100644 --- a/src/levanter/eval.py +++ b/src/levanter/eval.py @@ -8,8 +8,8 @@ import jax.numpy as jnp import jmp import numpy as np -import tqdm from jax.sharding import Mesh +from tqdm_loggable.auto import tqdm import haliax as hax from haliax.partitioning import ResourceMapping @@ -300,7 +300,7 @@ def evaluate(self, m: LmHeadModel): iterator = LoadingTimeTrackerIterator(self.loader) n = 0 - for batch, tags in tqdm.tqdm(iterator, "eval"): + for batch, tags in tqdm(iterator, "eval"): state = self.accum_for_batch(m, state, batch, tags) n += 1 diff --git a/src/levanter/infra/docker.py b/src/levanter/infra/docker.py index 63a51ae2f..39f3b1325 100644 --- a/src/levanter/infra/docker.py +++ b/src/levanter/infra/docker.py @@ -65,7 +65,12 @@ def read(fd): return b"".join(output) else: - return subprocess.check_output(argv, stderr=subprocess.STDOUT) + try: + return subprocess.check_output(argv, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + # print the output if the command failed, reraising the exception + print(e.output.decode()) + raise e def configure_gcp_docker(project_id, region, repository): diff --git a/src/levanter/infra/tpus.py b/src/levanter/infra/tpus.py index 7e630f069..b8a8df9e0 100644 --- a/src/levanter/infra/tpus.py +++ b/src/levanter/infra/tpus.py @@ -49,6 +49,7 @@ def list_tpus(zone): "list", f"--zone={zone}", "--format=json(name.basename(), state)", + "--quiet", ] ) ) @@ -68,6 +69,7 @@ def describe_tpu(tpu_name, zone): tpu_name, f"--zone={zone}", "--format=json(name.basename(), state)", + "--quiet", ], stderr=subprocess.DEVNULL, ) @@ -77,6 +79,8 @@ def describe_tpu(tpu_name, zone): def start_tpu_vm(tpu_name, *, tpu_type, 
capacity_type, version, zone, node_count): + # ensure alpha is enabled + run_command("gcloud", "components", "install", "alpha", "--quiet") if version is None: version = "tpu-ubuntu2204-base" tpu_stat = describe_tpu(tpu_name, zone) @@ -196,17 +200,31 @@ def run_command(*args, **kwargs): def add_ssh_key(ssh_key_filename): # format 3072 SHA256:... key-name (RSA) - key_hash = subprocess.check_output(["ssh-keygen", "-lf", ssh_key_filename]).decode("utf-8").split()[1] - existing_keys = subprocess.check_output(["ssh-add", "-l"]).decode("utf-8").split("\n") - for key in existing_keys: - if key_hash in key: - return + try: + key_hash = ( + subprocess.check_output(["ssh-keygen", "-lf", ssh_key_filename], stderr=subprocess.STDOUT) + .decode("utf-8") + .split()[1] + ) + existing_keys = ( + subprocess.check_output(["ssh-add", "-l"], stderr=subprocess.STDOUT).decode("utf-8").split("\n") + ) + for key in existing_keys: + if key_hash in key: + return - subprocess.check_call(["ssh-add", ssh_key_filename]) + subprocess.check_call(["ssh-add", ssh_key_filename]) + except subprocess.CalledProcessError: + raise def tpu_ssh(tpu_name, zone, node_count, *args, ignore_failure=False): - add_ssh_key(os.path.expanduser("~/.ssh/google_compute_engine")) + try: + add_ssh_key(os.path.expanduser("~/.ssh/google_compute_engine")) + except subprocess.CalledProcessError as e: + print("Failed to add ssh key. This may lead to problems.", e) + pass + try: if node_count > 1: return _tpu_ssh_multislice(tpu_name, zone, node_count, *args, ignore_failure=ignore_failure) @@ -219,6 +237,7 @@ def tpu_ssh(tpu_name, zone, node_count, *args, ignore_failure=False): "tpu-vm", "ssh", tpu_name, + "--quiet", "--worker=all", f"--zone={zone}", "--command=%s" % " ".join(args), @@ -243,6 +262,7 @@ def _tpu_ssh_multislice(tpu_name, zone, node_count, *args, ignore_failure=False) "ssh", f"{tpu_name}-{i}", "--worker=all", + "--quiet", f"--zone={zone}", "--command=%s" % " ".join(args), ) From d05036cc3bfb753e25bab95d17a96fc5fa9b6563 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Sep 2024 15:21:48 -0700 Subject: [PATCH 58/94] Update datasets requirement from ~=2.18 to >=2.18,<4.0 (#732) Updates the requirements on [datasets](https://github.com/huggingface/datasets) to permit the latest version. - [Release notes](https://github.com/huggingface/datasets/releases) - [Commits](https://github.com/huggingface/datasets/compare/2.18.0...3.0.0) --- updated-dependencies: - dependency-name: datasets dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8f65f071f..9604672dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "draccus>=0.8.0", "pyarrow>=11.0.0", "zstandard>=0.20.0", - "datasets~=2.18", + "datasets>=2.18,<4.0", "gcsfs>=2024.2,<2024.10", "braceexpand>=0.1.7", "jmp>=0.0.3", From ca16aa06cc438aabc85064b324bb8d3213a86bdb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Sep 2024 15:23:08 -0700 Subject: [PATCH 59/94] Bump tensorstore from 0.1.64 to 0.1.65 (#731) Bumps [tensorstore](https://github.com/google/tensorstore) from 0.1.64 to 0.1.65. 
- [Commits](https://github.com/google/tensorstore/compare/v0.1.64...v0.1.65) --- updated-dependencies: - dependency-name: tensorstore dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9604672dc..60fcdcc52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "braceexpand>=0.1.7", "jmp>=0.0.3", "fsspec[http]>=2024.2,<2024.10", - "tensorstore==0.1.64", + "tensorstore==0.1.65", "pytimeparse>=1.1.8", "humanfriendly==10.0", "safetensors[numpy]~=0.4.2", From 79fa64c2f4b9ba96bfe7bd5b5ff24280fad8a29e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Sep 2024 15:25:49 -0700 Subject: [PATCH 60/94] Bump equinox from 0.11.3 to 0.11.6 (#730) Bumps [equinox](https://github.com/patrick-kidger/equinox) from 0.11.3 to 0.11.6. - [Release notes](https://github.com/patrick-kidger/equinox/releases) - [Commits](https://github.com/patrick-kidger/equinox/compare/v0.11.3...v0.11.6) --- updated-dependencies: - dependency-name: equinox dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 60fcdcc52..0b72b20f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ ] dependencies = [ "haliax>=1.4.dev307", - "equinox==0.11.3", + "equinox==0.11.6", "jaxtyping>=0.2.20", "tokenizers>=0.15.2", "transformers>=4.41.2", From 07b3f1639ec4b2f05779733109a2906378e1af5a Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 17 Sep 2024 22:28:09 -0700 Subject: [PATCH 61/94] add bits-per-byte calculation to levanter (#729) --- src/levanter/eval.py | 136 ++++++++++++++++++++++++----- src/levanter/main/train_lm.py | 1 + src/levanter/utils/hf_utils.py | 30 +++++++ src/levanter/utils/stat_utils.py | 2 +- src/levanter/utils/thread_utils.py | 23 ++++- tests/test_hf_utils.py | 65 ++++++++++++++ tests/test_utils.py | 6 +- tests/tiny_test_corpus.py | 2 +- 8 files changed, 235 insertions(+), 30 deletions(-) diff --git a/src/levanter/eval.py b/src/levanter/eval.py index 2aa9b7ff3..555dd1466 100644 --- a/src/levanter/eval.py +++ b/src/levanter/eval.py @@ -5,6 +5,7 @@ from collections import defaultdict from typing import Callable, Mapping, Optional, Sequence, TypeVar +import equinox as eqx import jax.numpy as jnp import jmp import numpy as np @@ -19,7 +20,8 @@ from levanter.logging import LoadingTimeTrackerIterator from levanter.models.lm_model import LmExample, LmHeadModel, compute_next_token_loss from levanter.trainer import StepInfo -from levanter.utils.stat_utils import RunningMean +from levanter.utils.hf_utils import HfTokenizer, byte_length_of_token +from levanter.utils.stat_utils import Arrayish, RunningMean from levanter.utils.tree_utils import inference_mode @@ -37,6 +39,10 @@ class EvalResult: tag_macro_losses: dict[str, float] # per tag average-per-token loss tag_micro_losses: dict[str, float] # per tag total loss, for "parent" tags total_eval_loading_time: float + micro_bpb: Optional[float] = None + macro_bpb: Optional[float] = None + tag_macro_bpb: Optional[dict[str, float]] = None + 
tag_micro_bpb: Optional[dict[str, float]] = None # This class doesn't try to be async or work with incomplete datasets, because it's eval @@ -152,6 +158,7 @@ def _join_prefix(prefix: str, tag: str) -> str: def cb_tagged_lm_evaluate( EvalBatch: hax.Axis, tagged_eval_sets: Sequence[tuple[AsyncDataset[LmExample], Sequence[str]]], + tokenizer: Optional[HfTokenizer] = None, device_mesh: Optional[Mesh] = None, axis_mapping: ResourceMapping = None, max_examples_per_dataset: Optional[int] = None, @@ -173,12 +180,15 @@ def cb_tagged_lm_evaluate( Args: EvalBatch: The axis for the evaluation batch (mostly for the batch size) tagged_eval_sets: A list of datasets, each with its own domain tag + tokenizer: The tokenizer to use for bits-per-byte evaluation (optional) device_mesh: The mesh to use for evaluation axis_mapping: The axis mapping to use for evaluation + max_examples_per_dataset: The maximum number of examples to use from each dataset + prefix: The prefix to use for logging the losses """ evaluator = TaggedEvaluator( - EvalBatch, tagged_eval_sets, device_mesh, axis_mapping, max_examples_per_dataset, mp=mp + EvalBatch, tagged_eval_sets, tokenizer, device_mesh, axis_mapping, max_examples_per_dataset, mp=mp ) def eval_callback(step: StepInfo): @@ -213,6 +223,14 @@ def eval_callback(step: StepInfo): log_dict[_join_prefix(prefix, tag) + "/micro_loss"] = loss logger.info(f"{tag} micro loss: {loss:.3f}") + if tokenizer is not None: + log_dict[_join_prefix(prefix, "bpb")] = result.micro_bpb + log_dict[_join_prefix(prefix, "macro_bpb")] = result.macro_bpb + for tag, bpb in result.tag_micro_bpb.items(): + log_dict[_join_prefix(prefix, tag) + "/bpb"] = bpb + for tag, bpb in result.tag_macro_bpb.items(): + log_dict[_join_prefix(prefix, tag) + "/macro_bpb"] = bpb + levanter.tracker.log_metrics(log_dict, step=step.step) return result @@ -225,6 +243,8 @@ class TaggedEvaluator: Evaluates multiple tagged datasets using a given evaluation function. Scores for each tag are aggregated and logged separately, as well as getting an overall score. + TaggedEvaluator computes both log-perplexity and bits-per-byte for each tag, if a tokenizer is provided. + Tags are arranged hierarchically with "/" as separator, and we log both a micro and macro average loss for each tag. @@ -234,6 +254,7 @@ def __init__( self, EvalBatch: hax.Axis, tagged_eval_sets: Sequence[tuple[AsyncDataset, Sequence[str]]], + tokenizer: Optional[HfTokenizer] = None, device_mesh=None, axis_mapping=None, max_examples_per_dataset=None, @@ -249,6 +270,8 @@ def __init__( axis_resources=axis_mapping, ) self.mp = mp + self.tokenizer = tokenizer + self.bytes_per_token = self._calculate_bytes_per_token_type(tokenizer) # tags are arranged hierarchically with "/" as separator. We want to log the average loss for each tag. 
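# (Illustrative example, not from the patch: with tags ["news/cnn", "news/bbc", "code"], hierarchy maps
#  "news" -> [index("news/cnn"), index("news/bbc")]. In evaluate() below, a parent's macro loss/bpb is the
#  unweighted mean of its children's per-tag means, while its micro loss/bpb weights each child by its
#  token count.)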
hierarchy: dict[str, list[int]] = {} @@ -264,29 +287,45 @@ def __init__( self.hierarchy = hierarchy @hax.named_jit(out_axis_resources=axis_mapping) - def accum_for_batch( - m: LmHeadModel, state: tuple[RunningMean, RunningMean], batch: LmExample, tags: hax.NamedArray - ): + def accum_for_batch(m: LmHeadModel, state: _EvalRunningMeans, batch: LmExample, tags: hax.NamedArray): m = inference_mode(m, True) if self.mp is not None: m = self.mp.cast_to_compute(m) + with hax.axis_mapping(axis_mapping): - total_mean, mean_per_tag = state losses = compute_next_token_loss(m, batch, reduction=None, reduction_axis=()) - mask = batch.loss_mask # [Batch, Token] + mask = batch.loss_mask # [Batch, Pos] this_tokens = hax.sum(mask) this_loss = hax.einsum("->", losses, mask) # to scalar + # all the *_per_tag variables are [Tag] this_tokens_per_tag = hax.einsum("-> tag", mask, tags) this_loss_per_tag = hax.einsum("-> tag", mask, losses, tags) # [Tag] - mean = total_mean.add(this_loss / this_tokens, this_tokens) + mean = state.token_avg_loss.add(this_loss / this_tokens, this_tokens) # careful: this_tokens_per_tag can be 0 if there are no tokens for that tag safe_mean = hax.where(this_tokens_per_tag, this_loss_per_tag / this_tokens_per_tag, 0.0) - mean_per_tag = mean_per_tag.add(safe_mean, this_tokens_per_tag) + mean_per_tag = state.loss_per_tag.add(safe_mean, this_tokens_per_tag) + + state = dataclasses.replace(state, token_avg_loss=mean, loss_per_tag=mean_per_tag) + + if self.bytes_per_token is not None: + next_tokens = hax.roll(batch.tokens, -1, m.Pos) # [Batch, Pos], rolled by 1 for next token task + bytes_per_pos = self.bytes_per_token.take("vocab", next_tokens) # [Batch, Pos] + bytes_per_pos = bytes_per_pos * mask # [Batch, Pos] + bytes_per_tag = hax.einsum("-> tag", bytes_per_pos, tags) # [Tag] + total_bytes = hax.sum(bytes_per_tag) - return mean, mean_per_tag + # log loss -> bits is log2(e) * loss + bpb_per_tag = this_loss_per_tag / hax.maximum(bytes_per_tag, 1) * jnp.log2(jnp.e) + bpb = this_loss / hax.maximum(total_bytes, 1) * jnp.log2(jnp.e) + + bpb_mean = state.bpb.add(bpb, this_tokens) + bpb_per_tag_mean = state.bpb_per_tag.add(bpb_per_tag, this_tokens_per_tag) + state = dataclasses.replace(state, bpb=bpb_mean, bpb_per_tag=bpb_per_tag_mean) + + return state self.accum_for_batch = accum_for_batch @@ -294,7 +333,8 @@ def evaluate(self, m: LmHeadModel): total_loss = jnp.zeros(()) mean_losses_per_tag = hax.zeros(self.dataset.Tag, dtype=np.float32) - state = (RunningMean.zeros_like(total_loss), RunningMean.zeros_like(mean_losses_per_tag)) + state = _EvalRunningMeans.zeros_like(total_loss, mean_losses_per_tag) + del total_loss, mean_losses_per_tag state = hax.shard(state) iterator = LoadingTimeTrackerIterator(self.loader) @@ -304,19 +344,30 @@ def evaluate(self, m: LmHeadModel): state = self.accum_for_batch(m, state, batch, tags) n += 1 - total_loss, losses_per_tag = state - - micro_avg_loss = total_loss.mean.item() - tag_avg_loss = losses_per_tag.mean + micro_avg_loss = state.token_avg_loss.mean.item() + tag_avg_loss = state.loss_per_tag.mean # TODO: why do i have to jit this macro_avg_loss = hax.named_jit(lambda x: hax.mean(x).array)(tag_avg_loss).item() - tag_macro_loss = {} - tag_micro_loss = {} + if self.bytes_per_token is not None: + micro_bpb = state.bpb.mean.item() + tag_avg_bpb = state.bpb_per_tag.mean + macro_avg_bpb = hax.named_jit(lambda x: hax.mean(x).array)(tag_avg_bpb).item() + else: + micro_bpb = None + macro_avg_bpb = None + + tag_macro_loss: dict[str, float] = {} + tag_micro_loss: 
dict[str, float] = {} + tag_macro_bpb: dict[str, float] = {} + tag_micro_bpb: dict[str, float] = {} - mean_loss_per_tag_cpu = np.array(losses_per_tag.mean.array) # type: ignore - total_tokens_per_tag_cpu = np.array(losses_per_tag.total.array) # type: ignore + mean_loss_per_tag_cpu = np.array(state.loss_per_tag.mean.array) + total_tokens_per_tag_cpu = np.array(state.loss_per_tag.mean.array) + + mean_bits_per_tag_cpu = np.array(state.bpb_per_tag.mean.array) + total_bytes_per_tag_cpu = np.array(state.bpb_per_tag.mean.array) # add in the hierarchy for parent, children in self.hierarchy.items(): @@ -333,8 +384,51 @@ def evaluate(self, m: LmHeadModel): # (average doesn't support where directly so we just 0 out the weights) tag_micro_loss[parent] = np.average(mean_loss_per_tag_cpu, weights=total_tokens_per_tag_cpu * mask) + if self.bytes_per_token is not None: + tag_macro_bpb[parent] = np.mean(mean_bits_per_tag_cpu, where=mask) + tag_micro_bpb[parent] = np.average(mean_bits_per_tag_cpu, weights=total_bytes_per_tag_cpu * mask) + for tag, index in self.dataset.tag_to_index.items(): - tag_micro_loss[tag] = mean_loss_per_tag_cpu[index] + tag_micro_loss[tag] = float(mean_loss_per_tag_cpu[index]) # no macro loss for the leaf tags - return EvalResult(micro_avg_loss, macro_avg_loss, tag_macro_loss, tag_micro_loss, iterator.total_time) + if self.bytes_per_token is not None: + tag_micro_bpb[tag] = float(mean_bits_per_tag_cpu[index]) + + return EvalResult( + micro_avg_loss, + macro_avg_loss, + tag_macro_loss, + tag_micro_loss, + iterator.total_time, + micro_bpb, + macro_avg_bpb, + tag_macro_bpb, + tag_micro_bpb, + ) + + def _calculate_bytes_per_token_type(self, tokenizer: HfTokenizer) -> Optional[hax.NamedArray]: + if tokenizer is None: + return None + else: + # calculate the number of bytes in each token + Vocab = hax.Axis("vocab", len(tokenizer.get_vocab())) + bytes = np.ndarray((Vocab.size,), dtype=np.int32) + + for i in range(Vocab.size): + bytes[i] = byte_length_of_token(tokenizer, i) + + return hax.named(jnp.array(bytes), Vocab) + + +class _EvalRunningMeans(eqx.Module): + token_avg_loss: RunningMean # average loss averaged over all tokens + loss_per_tag: RunningMean # average loss per tag + bpb: RunningMean # bits per byte averaged over all tokens + bpb_per_tag: RunningMean # bits per byte per tag + + @staticmethod + def zeros_like(total: Arrayish, per_tag: Arrayish) -> "_EvalRunningMeans": + z = RunningMean.zeros_like(total) + per_tag = RunningMean.zeros_like(per_tag) + return _EvalRunningMeans(z, per_tag, z, per_tag) diff --git a/src/levanter/main/train_lm.py b/src/levanter/main/train_lm.py index 8e905b064..6c96f8b62 100644 --- a/src/levanter/main/train_lm.py +++ b/src/levanter/main/train_lm.py @@ -174,6 +174,7 @@ def main(config: TrainLmConfig): cb = levanter.eval.cb_tagged_lm_evaluate( EvalBatch, causal_datasets, + tokenizer, trainer.device_mesh, compute_axis_mapping, max_eval_examples_per_ds, diff --git a/src/levanter/utils/hf_utils.py b/src/levanter/utils/hf_utils.py index e5a576236..922de4830 100644 --- a/src/levanter/utils/hf_utils.py +++ b/src/levanter/utils/hf_utils.py @@ -1,4 +1,8 @@ import os +import re +from typing import TypeAlias + +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from levanter.logging import silence_transformer_nag from levanter.utils.py_utils import logical_cpu_core_count @@ -8,6 +12,8 @@ _HF_TOKENIZER_OFF_VALUES = {"off", "false", "f", "no", "n", "0"} +HfTokenizer: TypeAlias = PreTrainedTokenizerFast | PreTrainedTokenizer + def 
num_cpus_used_by_tokenizer(tokenizer) -> int: if getattr(tokenizer, "is_fast", False): @@ -20,3 +26,27 @@ def num_cpus_used_by_tokenizer(tokenizer) -> int: return min(max(1, logical_cpu_core_count() - 2), 12) else: return 1 + + +def byte_length_of_token(tokenizer, idx: int) -> int: + # this is a pain because we want the prefix spaces, but we don't want extra noise for bytes + # e.g. in llama + # >>> t.convert_ids_to_tokens(q[2]) + # '▁this' + # >>> t.convert_ids_to_tokens(25) + # '<0x16>' + # We want the _ (as a single byte, not the 3 it's encoded as) but not the <0x16>, which should instead be a single byte \x16 + # decode strips the prefix spaces, but does correctly handle the <0x16> case + # we can avoid prefix space issues by prepending another token before decoding, then stripping + repr = tokenizer.convert_ids_to_tokens(idx) + if idx in tokenizer.all_special_ids: + # NB: special tokens don't have bytes, but they contribute to perplexity/bits + return 0 + # handle bytes specially. This is a bit of a hack, but there's no other way + elif m := re.match(r"<0x([0-9A-Fa-f]+)>", repr): + return len(bytes.fromhex(m.group(1))) + else: + extra_token = tokenizer(".", add_special_tokens=False)["input_ids"][0] + excess_bytes = len(".".encode("utf-8")) + decoded = tokenizer.decode([extra_token, idx]).encode("utf-8") + return len(decoded) - excess_bytes diff --git a/src/levanter/utils/stat_utils.py b/src/levanter/utils/stat_utils.py index e51918d2f..6111be42e 100644 --- a/src/levanter/utils/stat_utils.py +++ b/src/levanter/utils/stat_utils.py @@ -7,7 +7,7 @@ import haliax as hax -Arrayish: typing.TypeAlias = hax.NamedArray | np.ndarray | jnp.ndarray | float +Arrayish: typing.TypeAlias = hax.NamedArray | np.ndarray | jnp.ndarray class RunningMean(eqx.Module): diff --git a/src/levanter/utils/thread_utils.py b/src/levanter/utils/thread_utils.py index 0b4abcdaf..fad60ad31 100644 --- a/src/levanter/utils/thread_utils.py +++ b/src/levanter/utils/thread_utils.py @@ -34,24 +34,41 @@ class AsyncIteratorWrapper(Iterator): def __init__(self, async_iter): self.async_iter = async_iter self.loop = asyncio.new_event_loop() - self.executor = ThreadPoolExecutor(max_workers=1) self.thread = threading.Thread(target=self._run_loop, daemon=True) self.thread.start() + self._exhausted = False # Flag to indicate if the iterator is exhausted def _run_loop(self): asyncio.set_event_loop(self.loop) self.loop.run_forever() def _run_async_task(self, coro): - return asyncio.run_coroutine_threadsafe(coro, self.loop).result() + if not self.loop.is_running() or not self.thread.is_alive(): + raise StopIteration + try: + future = asyncio.run_coroutine_threadsafe(coro, self.loop) + return future.result() + except (RuntimeError, asyncio.CancelledError): + raise StopIteration def __iter__(self): return self def __next__(self): + if self._exhausted: + raise StopIteration try: return self._run_async_task(self.async_iter.__anext__()) except StopAsyncIteration: - self.loop.call_soon_threadsafe(self.loop.stop) + self._exhausted = True # Mark the iterator as exhausted + if self.loop.is_running(): + self.loop.call_soon_threadsafe(self.loop.stop) self.thread.join() raise StopIteration + + def close(self): + """Close the event loop and thread gracefully.""" + if self.loop.is_running(): + self.loop.call_soon_threadsafe(self.loop.stop) + self.thread.join() + self.loop.close() diff --git a/tests/test_hf_utils.py b/tests/test_hf_utils.py index e6a6158e2..c3c322cf0 100644 --- a/tests/test_hf_utils.py +++ b/tests/test_hf_utils.py @@ -3,6 +3,8 @@ from 
fsspec import AbstractFileSystem from levanter.compat.hf_checkpoints import load_tokenizer +from levanter.utils.hf_utils import byte_length_of_token +from test_utils import skip_if_hf_model_not_accessible def test_load_tokenizer_in_memory_fs(): @@ -22,3 +24,66 @@ def test_load_tokenizer_in_memory_fs(): ) tokenizer = load_tokenizer("memory://foo/") assert len(tokenizer) == 5027 + + +@skip_if_hf_model_not_accessible("meta-llama/Llama-2-7b-hf") +def test_byte_length_of_token(): + tok = load_tokenizer("meta-llama/Llama-2-7b-hf") + ids = tok("this is hello a test", add_special_tokens=False)["input_ids"] + assert byte_length_of_token(tok, ids[2]) == len(" hello".encode("utf-8")) + assert byte_length_of_token(tok, 25) == 1 + # llama prepends a space to the string. ideally it wouldn't b/c it technically throws off our bpb calculations + # but it's a small difference + assert byte_length_of_token(tok, ids[0]) == len(" this".encode("utf-8")) + + bos = tok.bos_token_id + assert byte_length_of_token(tok, bos) == 0 + + # 632: "▁▁▁▁▁▁▁▁▁▁▁▁" which is just 12 spaces + # assert byte_length_of_token(tok, 632) == len(" ".encode("utf-8")) + # 8535: "ными" + # assert byte_length_of_token(tok, 8535) == len("ными".encode("utf-8")) + + checks = { + 632: " " * 12, + 8535: "ными", + 25: " ", + } + + for token_id, expected_length in checks.items(): + assert byte_length_of_token(tok, token_id) == len(expected_length.encode("utf-8")) + + # now just test all tokens and print the ones that aren't expected + # the ones less than 259 are bytes or special tokens + for i in range(3, 259): + byte_length = byte_length_of_token(tok, i) + assert byte_length == 1, f"Token {i} has length {byte_length} but expected 1" + + for i in range(259, tok.vocab_size): + byte_length = byte_length_of_token(tok, i) + expected_length = len(tok.convert_ids_to_tokens(i).replace("▁", " ").encode("utf-8")) + assert byte_length == expected_length, f"Token {i} has length {byte_length} but expected {expected_length}" + + +@skip_if_hf_model_not_accessible("meta-llama/Llama-2-7b-hf") +def test_byte_length_of_token_multi(): + tok = load_tokenizer("meta-llama/Llama-2-7b-hf") + multi_checks = [ + "👍你好", + ] + + for expr in multi_checks: + # stupid llama adds a prefix space + token_ids = tok.encode(expr, add_special_tokens=False)[1:] + total_length = sum(byte_length_of_token(tok, token_id) for token_id in token_ids) + assert total_length == len(expr.encode("utf-8")) + + +@skip_if_hf_model_not_accessible("gpt2") +def test_byte_length_of_token_gpt2(): + tok = load_tokenizer("gpt2") + ids = tok("this is hello a test", add_special_tokens=False)["input_ids"] + assert byte_length_of_token(tok, ids[2]) == len(" hello".encode("utf-8")) + + eos = tok.eos_token_id + assert byte_length_of_token(tok, eos) == 0 diff --git a/tests/test_utils.py b/tests/test_utils.py index 1bf03b624..6206ec2ff 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -11,7 +11,7 @@ from equinox import nn as nn from equinox import static_field from jax._src.random import PRNGKey -from transformers import BatchEncoding +from transformers import AutoConfig, BatchEncoding import haliax as hax @@ -171,9 +171,7 @@ def try_load_path(path): def skip_if_hf_model_not_accessible(model_id: str): def try_load_hf(model_id): try: - from transformers import AutoModel - - AutoModel.from_pretrained(model_id) + AutoConfig.from_pretrained(model_id) except Exception: return False else: diff --git a/tests/tiny_test_corpus.py b/tests/tiny_test_corpus.py index 91597c137..fb09f362a 100644 --- 
a/tests/tiny_test_corpus.py +++ b/tests/tiny_test_corpus.py @@ -69,7 +69,7 @@ def construct_small_data_cache( validation_urls=[f"file://{path}/validation/docs.jsonl"], cache_dir=f"{path}/cache", vocab_size=vocab_size, - tokenizer="passthrough", + tokenizer="gpt2", ) return config, caches From fe3e2f3f47cc7cfd92ad65b393468bbe80bad1da Mon Sep 17 00:00:00 2001 From: David Hall Date: Sun, 22 Sep 2024 16:05:43 -0700 Subject: [PATCH 62/94] fix sequence parallel attention in splash attention (#738) * fix sequence parallel attention in splash attention * revert head change --- config/gpt2_small_fast.yaml | 3 ++ src/levanter/models/attention.py | 47 ++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/config/gpt2_small_fast.yaml b/config/gpt2_small_fast.yaml index 6242a37bc..054977899 100644 --- a/config/gpt2_small_fast.yaml +++ b/config/gpt2_small_fast.yaml @@ -19,6 +19,9 @@ trainer: train_batch_size: 256 num_train_steps: 20000 + +# tensor_parallel_axes: ["position", "key_position"] +# tensor_parallel_axes: ["heads", "mlp"] optimizer: learning_rate: 1E-3 weight_decay: 0.1 diff --git a/src/levanter/models/attention.py b/src/levanter/models/attention.py index e7c94f50b..633feee68 100644 --- a/src/levanter/models/attention.py +++ b/src/levanter/models/attention.py @@ -836,6 +836,10 @@ def flatten(axes): check_rep=False, ) def wrap_flash_attention(q, k, v): + # NB: inside the function, q, k, and v are partitioned, so in general the lengths of dims are not the same + Sq = q.shape[2] + Sk = k.shape[2] + Hq = q.shape[1] block_sizes = splash_attention_kernel.BlockSizes( block_q=min(block_size, Sq), block_kv_compute=min(block_size, Sk), @@ -848,14 +852,14 @@ def wrap_flash_attention(q, k, v): ) if mask is None: - kernel_mask = splash_attention_mask.FullMask(_shape=(Sq, Sk)) + base_mask = splash_attention_mask.FullMask(_shape=(Sq, Sk)) elif isinstance(mask, AttentionMask): if mask.is_causal: - masks = [splash_attention_mask.CausalMask(shape=(Sq, Sq)) for i in range(Hq)] - kernel_mask = splash_attention_mask.MultiHeadMask(masks=masks) + base_mask = splash_attention_mask.CausalMask(shape=(Sq, Sk)) else: - kernel_mask = splash_attention_mask.FullMask(_shape=(Sq, Sk)) + base_mask = splash_attention_mask.FullMask(_shape=(Sq, Sk)) + # This is going to be a pain to support if mask.explicit_mask is not None: raise NotImplementedError("Explicit masks are not yet supported for splash attention") elif isinstance(mask, NamedArray): @@ -863,6 +867,8 @@ def wrap_flash_attention(q, k, v): else: raise ValueError(f"Unknown mask type: {mask}") + kernel_mask = splash_attention_mask.MultiHeadMask(masks=[base_mask for _ in range(Hq)]) + # copied from MaxText splash_kernel = splash_attention_kernel.make_splash_mha( mask=kernel_mask, head_shards=1, q_seq_shards=1, block_sizes=block_sizes @@ -879,22 +885,23 @@ def wrap_flash_attention(q, k, v): # the output shape is B, S_q, H_q, D_v. 
Right now we're requiring D_k == D_v # we can reshape it to match our expected output attn_output = _unflatten_bshd(attn_output, q_class, v_class) - reference_out_shape = eqx.filter_eval_shape( - simple_attention_with_dropout, - QPos, - KPos, - Key, - query, - key, - value, - mask, - bias, - inference, - dropout, - attention_dtype, - precision, - prng=prng, - ) + with haliax.axis_mapping({}): + reference_out_shape = eqx.filter_eval_shape( + simple_attention_with_dropout, + QPos, + KPos, + Key, + query, + key, + value, + mask, + bias, + inference, + dropout, + attention_dtype, + precision, + prng=prng, + ) attn_output = attn_output.rearrange(reference_out_shape.axes).astype(reference_out_shape.dtype) attn_output = haliax.shard(attn_output) From 9fa3aaa4096e6843d18b901b9e3e73adf3fa814e Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 24 Sep 2024 00:48:36 -0700 Subject: [PATCH 63/94] fix llama 3 rotary embeddings (#740) --- src/levanter/models/gemma.py | 9 +- src/levanter/models/llama.py | 75 +++----------- src/levanter/models/rotary.py | 182 ++++++++++++++++++++++++++++++++++ tests/test_llama.py | 18 ++-- tests/test_llama3.py | 36 ++++++- 5 files changed, 246 insertions(+), 74 deletions(-) create mode 100644 src/levanter/models/rotary.py diff --git a/src/levanter/models/gemma.py b/src/levanter/models/gemma.py index 1f8396b20..af5cc44be 100644 --- a/src/levanter/models/gemma.py +++ b/src/levanter/models/gemma.py @@ -28,6 +28,7 @@ LlamaMlp, ) from levanter.models.lm_model import LmConfig, LmHeadModel +from levanter.models.rotary import DefaultRotaryEmbeddingsConfig, RotaryEmbeddingsConfig from levanter.types import BlockFoldable from levanter.utils.flop_utils import lm_flops_per_token @@ -80,7 +81,6 @@ class GemmaConfig(HFCompatConfig): attn_dropout = 0.0 norm_eps = 1e-6 - rope_base: int = 10_000 norm_embeddings: bool = True # Attention-related config @@ -94,9 +94,12 @@ class GemmaConfig(HFCompatConfig): scan_layers: bool = True use_bias: bool = False - rope_scaling: Optional[dict] = None rope_theta: float = 10000.0 + @property + def rope(self) -> RotaryEmbeddingsConfig: + return DefaultRotaryEmbeddingsConfig(theta=self.rope_theta) + # Axis Pos = property(lambda self: Axis(name="position", size=self.seq_len)) KeyPos = property(lambda self: self.Pos.alias("key_position")) @@ -146,7 +149,7 @@ def from_hf_config(cls, hf_config: HfConfig): num_kv_heads=hf_config.num_key_value_heads, initializer_range=hf_config.initializer_range, layer_norm_epsilon=hf_config.rms_norm_eps, - rope_base=hf_config.rope_theta, + rope_theta=hf_config.rope_theta, ) def to_hf_config(self, vocab_size: int, config_overrides: Optional[Dict] = None) -> HfGemmaConfig: diff --git a/src/levanter/models/llama.py b/src/levanter/models/llama.py index 2a2d2664d..e777b7636 100644 --- a/src/levanter/models/llama.py +++ b/src/levanter/models/llama.py @@ -1,9 +1,8 @@ import dataclasses from dataclasses import dataclass -from typing import Callable, Dict, Optional, Tuple, Type, Union +from typing import Callable, Dict, Optional, Type, Union import equinox as eqx -import jax import jax.numpy as jnp import jax.random as jrandom from jaxtyping import PRNGKeyArray @@ -28,6 +27,7 @@ from levanter.models.attention import AttentionBackend, AttentionMask, dot_product_attention from levanter.models.gpt2 import ACT2FN from levanter.models.lm_model import LmConfig, LmHeadModel +from levanter.models.rotary import DefaultRotaryEmbeddingsConfig, RotaryEmbeddingsConfig from levanter.types import BlockFoldable from levanter.utils.flop_utils import 
lm_flops_per_token @@ -77,8 +77,7 @@ class LlamaConfig(HFCompatConfig): use_bias: bool = False use_layer_norm_weight: bool = True - rope_scaling: Optional[dict] = None - rope_theta: float = 10000.0 + rope: RotaryEmbeddingsConfig = dataclasses.field(default_factory=DefaultRotaryEmbeddingsConfig) reference_checkpoint: str = "meta-llama/Llama-2-7b-hf" tokenizer: Optional[str] = None @@ -109,6 +108,8 @@ def hf_checkpoint_converter(self) -> HFCheckpointConverter["LlamaConfig"]: # ty @classmethod def from_hf_config(cls, hf_config: HfConfig): + rope_theta = hf_config.rope_theta + rope_config = RotaryEmbeddingsConfig.from_hf_config(rope_theta, hf_config.rope_scaling) return LlamaConfig( seq_len=hf_config.max_position_embeddings, hidden_dim=hf_config.hidden_size, @@ -119,8 +120,7 @@ def from_hf_config(cls, hf_config: HfConfig): activation_function=hf_config.hidden_act, initializer_range=hf_config.initializer_range, layer_norm_epsilon=hf_config.rms_norm_eps, - rope_scaling=hf_config.rope_scaling, - rope_theta=hf_config.rope_theta, + rope=rope_config, ) def to_hf_config(self, vocab_size: int, config_overrides: Optional[Dict] = None) -> HfLlamaConfig: @@ -136,6 +136,8 @@ def to_hf_config(self, vocab_size: int, config_overrides: Optional[Dict] = None) if config_overrides is None: config_overrides = {} + rope_theta, rope_scaling = self.rope.to_hf_config() + return HfLlamaConfig( max_position_embeddings=self.seq_len, hidden_size=self.hidden_dim, @@ -146,9 +148,10 @@ def to_hf_config(self, vocab_size: int, config_overrides: Optional[Dict] = None) hidden_act=self.activation_function, initializer_range=self.initializer_range, rms_norm_eps=self.layer_norm_epsilon, - rope_scaling=self.rope_scaling, + # rope_scaling=self.rope_scaling, vocab_size=vocab_size, - rope_theta=self.rope_theta, + rope_theta=rope_theta, + rope_scaling=rope_scaling, **config_overrides, ) @@ -274,13 +277,6 @@ def init(config: LlamaConfig, *, key) -> "LlamaAttention": ) return LlamaAttention(config, q_proj, k_proj, v_proj, o_proj) - def _rope_scale_factor(self) -> float: - # hasattr for gemma and I'm feeling lazy - if hasattr(self.config, "rope_scaling") and self.config.rope_scaling is not None: - assert self.config.rope_scaling["type"] == "linear" - return self.config.rope_scaling["factor"] - return 1.0 - @named_call def __call__(self, x: NamedArray, mask: Optional[NamedArray | AttentionMask], *, key=None) -> NamedArray: key_q, key_k, key_v, key_o = maybe_rng_split(key, 4) @@ -290,13 +286,8 @@ def __call__(self, x: NamedArray, mask: Optional[NamedArray | AttentionMask], *, k = self.k_proj(x, key=key_k).rearrange((..., "kv_heads", "position", "head_size")) v = self.v_proj(x, key=key_v).rearrange((..., "kv_heads", "position", "head_size")) - cos, sin = llama_rotary_pos_emb( - self.config.HeadSize, - x.resolve_axis("position"), - scale=self._rope_scale_factor(), - theta=self.config.rope_theta, - ) - q, k = _apply_rotary_pos_emb(q, k, cos, sin) + rot_embs = self.config.rope.build(self.config.HeadSize, q.resolve_axis("position")) + q, k = rot_embs(self.config.HeadSize, q, k) k = k.rename({"position": "key_position"}) v = v.rename({"position": "key_position"}) @@ -588,43 +579,3 @@ def update_state_dict(self, state_dict: StateDict, prefix: Optional[str] = None) state_dict.update(my_dict) return state_dict - - -def _rotate_half(x: NamedArray) -> NamedArray: - """Rotates half of the hidden dims of the input and concatenates them.""" - HeadSize = x.axes[-1] - x1 = x[HeadSize, : HeadSize.size // 2] - x2 = x[HeadSize, HeadSize.size // 2 :] - out = 
hax.concatenate(HeadSize, (-x2, x1)) - return out - - -def _apply_rotary_pos_emb( - q: NamedArray, # [batch, position, kv_heads, q_heads_per_group, head_size] - k: NamedArray, # [batch, position, kv_heads, head_size] - cos: NamedArray, # [position, head_size] - sin: NamedArray, # [position, head_size] -) -> Tuple[NamedArray, NamedArray]: - """Applies rotary position embedding to q and k.""" - q_embed = q * cos + _rotate_half(q) * sin - k_embed = k * cos + _rotate_half(k) * sin - return q_embed, k_embed - - -def llama_rotary_pos_emb( - HeadSize: Axis, Pos: Axis, theta: float = 10000, scale: float = 1.0 -) -> Tuple[NamedArray, NamedArray]: - with jax.ensure_compile_time_eval(): - HeadHalfSize = HeadSize.resize(HeadSize.size // 2) - inv_freq: NamedArray = 1.0 / (theta ** (hax.arange(HeadHalfSize, step=2) / HeadSize.size)) - - position_ids: NamedArray = hax.arange(Pos) / scale - - freqs = position_ids * inv_freq.broadcast_axis(Pos) - # This is different from the paper but aligns with HF implementation: - # It uses a different permutation in order to obtain the same calculation - emb = hax.concatenate(HeadSize, (freqs, freqs)) - cos = hax.cos(emb) - sin = hax.sin(emb) - # This is different from the paper but aligns with HF implementation: - return cos, sin diff --git a/src/levanter/models/rotary.py b/src/levanter/models/rotary.py new file mode 100644 index 000000000..07657e5ff --- /dev/null +++ b/src/levanter/models/rotary.py @@ -0,0 +1,182 @@ +import abc +from dataclasses import dataclass +from typing import Tuple + +import draccus +import equinox as eqx +import jax +import jax.numpy as jnp + +import haliax as hax +from haliax import Axis, NamedArray + + +def _rotate_half(x: NamedArray, HeadSize: Axis) -> NamedArray: + """Rotates half of the hidden dims of the input and concatenates them.""" + x1 = x[HeadSize, : HeadSize.size // 2] + x2 = x[HeadSize, HeadSize.size // 2 :] + out = hax.concatenate(HeadSize, (-x2, x1)) + return out + + +class RotaryEmbeddings(eqx.Module): + cos: NamedArray + sin: NamedArray + + @property + def nograd_cos(self): + return jax.lax.stop_gradient(self.cos) + + @property + def nograd_sin(self): + return jax.lax.stop_gradient(self.sin) + + def __call__(self, HeadDim: Axis, q: NamedArray, k: NamedArray) -> tuple[NamedArray, NamedArray]: + q_embed = q * self.nograd_cos + _rotate_half(q, HeadDim) * self.nograd_sin + k_embed = k * self.nograd_cos + _rotate_half(k, HeadDim) * self.nograd_sin + return q_embed, k_embed + + +@dataclass +class RotaryEmbeddingsConfig(abc.ABC, draccus.ChoiceRegistry): + @abc.abstractmethod + def build(self, HeadSize: Axis, Pos: Axis) -> RotaryEmbeddings: + pass + + @staticmethod + def from_hf_config(rope_theta, config: dict | None) -> "RotaryEmbeddingsConfig": + if config is None: + return DefaultRotaryEmbeddingsConfig(theta=rope_theta) + tpe = config.get("rope_type") or config.get("type") or "default" + return RotaryEmbeddingsConfig.get_choice_class(tpe).make_from_hf_config(rope_theta, config) + + @classmethod + @abc.abstractmethod + def make_from_hf_config(cls, rope_theta: float, config: dict) -> "RotaryEmbeddingsConfig": + pass + + @abc.abstractmethod + def to_hf_config(self) -> tuple[float, dict | None]: + """Returns the rope_theta and config dict for the HF config.""" + pass + + +@dataclass +class DefaultRotaryEmbeddingsConfig(RotaryEmbeddingsConfig): + theta: float = 10000 + factor: float = 1.0 # this should have been called scale_factor, but for hf compat + + def build(self, HeadSize: Axis, Pos: Axis) -> RotaryEmbeddings: + with 
jax.ensure_compile_time_eval(): + HeadHalfSize = HeadSize.resize(HeadSize.size // 2) + inv_freq: NamedArray = 1.0 / (self.theta ** (hax.arange(HeadHalfSize, step=2) / HeadSize.size)) + inv_freq = inv_freq / self.factor + + position_ids: NamedArray = hax.arange(Pos) + + freqs = position_ids * inv_freq.broadcast_axis(Pos) + emb = hax.concatenate(HeadSize, (freqs, freqs)) + cos = hax.cos(emb) + sin = hax.sin(emb) + return RotaryEmbeddings(cos=cos, sin=sin) + + @classmethod + def make_from_hf_config(cls, rope_theta: float, config: dict) -> "RotaryEmbeddingsConfig": + return DefaultRotaryEmbeddingsConfig(theta=rope_theta, factor=config.get("factor", 1.0)) + + def to_hf_config(self) -> tuple[float, dict | None]: + if self.factor == 1.0: + return self.theta, None + return self.theta, {"factor": self.factor} + + +RotaryEmbeddingsConfig.register_subclass("default", DefaultRotaryEmbeddingsConfig) +RotaryEmbeddingsConfig.register_subclass("linear", DefaultRotaryEmbeddingsConfig) + + +@dataclass +class Llama3RotaryEmbeddingsConfig(RotaryEmbeddingsConfig): + """ + To match this from HF: + "rope_scaling": { + "factor": 8.0, + "low_freq_factor": 1.0, + "high_freq_factor": 4.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + """ + + theta: float = 500000 + factor: float = 8.0 + low_freq_factor: float = 1.0 + high_freq_factor: float = 4.0 + original_max_position_embeddings: int = 8192 + + def build(self, HeadSize: Axis, Pos: Axis) -> RotaryEmbeddings: + # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_rope_utils.py#L307 + # Porting that to JAX/Haliax: + with jax.ensure_compile_time_eval(): + HeadHalfSize = HeadSize.resize(HeadSize.size // 2) + inv_freq: NamedArray = 1.0 / (self.theta ** (hax.arange(HeadHalfSize, step=2) / HeadSize.size)) + + old_context_len = self.original_max_position_embeddings + low_freq_wavelen = old_context_len / self.low_freq_factor + high_freq_wavelen = old_context_len / self.high_freq_factor + + wavelen = 2 * jnp.pi / inv_freq + inv_freq_llama = hax.where(wavelen > low_freq_wavelen, inv_freq / self.factor, inv_freq) + smooth_factor = (old_context_len / wavelen - self.low_freq_factor) / ( + self.high_freq_factor - self.low_freq_factor + ) + smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / self.factor + smooth_factor * inv_freq_llama + is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen) + inv_freq_llama = hax.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama) + + position_ids: NamedArray = hax.arange(Pos) + + freqs = position_ids * inv_freq_llama.broadcast_axis(Pos) + emb = hax.concatenate(HeadSize, (freqs, freqs)) + cos = hax.cos(emb) + sin = hax.sin(emb) + return RotaryEmbeddings(cos=cos, sin=sin) + + @classmethod + def make_from_hf_config(cls, rope_theta: float, config: dict) -> "RotaryEmbeddingsConfig": + return Llama3RotaryEmbeddingsConfig( + theta=rope_theta, + factor=config.get("factor", 8.0), + low_freq_factor=config.get("low_freq_factor", 1.0), + high_freq_factor=config.get("high_freq_factor", 4.0), + original_max_position_embeddings=config.get("original_max_position_embeddings", 8192), + ) + + def to_hf_config(self) -> tuple[float, dict]: + return self.theta, { + "factor": self.factor, + "low_freq_factor": self.low_freq_factor, + "high_freq_factor": self.high_freq_factor, + "original_max_position_embeddings": self.original_max_position_embeddings, + } + + +RotaryEmbeddingsConfig.register_subclass("llama3", Llama3RotaryEmbeddingsConfig) + + +def rotary_pos_emb( + HeadSize: 
Axis, Pos: Axis, theta: float = 10000, scale: float = 1.0 +) -> Tuple[NamedArray, NamedArray]: + with jax.ensure_compile_time_eval(): + HeadHalfSize = HeadSize.resize(HeadSize.size // 2) + inv_freq: NamedArray = 1.0 / (theta ** (hax.arange(HeadHalfSize, step=2) / HeadSize.size)) / scale + + position_ids: NamedArray = hax.arange(Pos) + + freqs = position_ids * inv_freq.broadcast_axis(Pos) + # This is different from the paper but aligns with HF implementation: + # It uses a different permutation in order to obtain the same calculation + emb = hax.concatenate(HeadSize, (freqs, freqs)) + cos = hax.cos(emb) + sin = hax.sin(emb) + # This is different from the paper but aligns with HF implementation: + return cos, sin diff --git a/tests/test_llama.py b/tests/test_llama.py index 4277150fe..2d2b6506f 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -12,9 +12,8 @@ from levanter.models.attention import AttentionMask from levanter.models.llama import LlamaAttention, LlamaConfig, LlamaDecoderLayer, LlamaLMHeadModel, LlamaRMSNorm -from levanter.models.llama import _apply_rotary_pos_emb as levanter_apply_rotary_pos_emb -from levanter.models.llama import _rotate_half as levanter_rotate_half -from levanter.models.llama import llama_rotary_pos_emb +from levanter.models.rotary import DefaultRotaryEmbeddingsConfig, RotaryEmbeddings +from levanter.models.rotary import _rotate_half as levanter_rotate_half from levanter.utils.jax_utils import parameter_count from test_utils import check_load_config, check_model_works_with_seqlen, parameterize_with_configs, skip_if_no_torch @@ -71,7 +70,9 @@ def test_llama_rotary_embedding(): x = random.normal(key, (1, seq_len)) x_torch = torch.from_numpy(np.array(x)) - levanter_output = llama_rotary_pos_emb(HeadSize=HeadSize, Pos=Pos) + levanter_emb = DefaultRotaryEmbeddingsConfig().build(HeadSize=HeadSize, Pos=Pos) + levanter_output = (levanter_emb.cos, levanter_emb.sin) + hf_rope = HFLlamaRotaryEmbedding(dim=hidden_dim, max_position_embeddings=seq_len, device=device) hf_output = hf_rope(x_torch, torch.arange(seq_len).reshape(1, -1)) @@ -106,8 +107,8 @@ def named_array_to_tensor(named_array): k = hax.random.normal(random.PRNGKey(1), (Batch, Pos, Heads, HeadSize)) # Check the output of _rotate_half() from levanter and hf - levanter_out_rf_q = levanter_rotate_half(q) - levanter_out_rf_k = levanter_rotate_half(k) + levanter_out_rf_q = levanter_rotate_half(q, HeadSize) + levanter_out_rf_k = levanter_rotate_half(k, HeadSize) q_tensor = named_array_to_tensor(q).transpose(1, 2) # needed for HF k_tensor = named_array_to_tensor(k).transpose(1, 2) @@ -121,7 +122,9 @@ def named_array_to_tensor(named_array): cos = hax.random.normal(random.PRNGKey(2), (Pos, HeadSize)) sin = hax.random.normal(random.PRNGKey(3), (Pos, HeadSize)) - levanter_out_rope_q, levanter_out_rope_k = levanter_apply_rotary_pos_emb(q, k, cos, sin) + rot = RotaryEmbeddings(cos=cos, sin=sin) + + levanter_out_rope_q, levanter_out_rope_k = rot(HeadSize, q, k) cos_tensor = named_array_to_tensor(cos)[None, :, :] sin_tensor = named_array_to_tensor(sin)[None, :, :] @@ -328,7 +331,6 @@ def _get_llama_config(use_flash=False, num_kv_heads=4, seq_len=128) -> LlamaConf hidden_dim=16, num_heads=4, num_kv_heads=num_kv_heads, - rope_scaling=None, gradient_checkpointing=False, # disable for tests so debugging is easier use_flash_attention=use_flash, flash_attention_block_size=8 if use_flash else None, diff --git a/tests/test_llama3.py b/tests/test_llama3.py index a6f1d67b8..2fae326d1 100644 --- a/tests/test_llama3.py +++ 
b/tests/test_llama3.py @@ -35,7 +35,13 @@ def get_config(vocab_size=1000): "num_key_value_heads": 8, "pretraining_tp": 1, "rms_norm_eps": 0.00001, - "rope_scaling": null, + "rope_scaling": { + "factor": 8.0, + "low_freq_factor": 1.0, + "high_freq_factor": 4.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, "rope_theta": 500000, "tie_word_embeddings": false, "torch_dtype": "bfloat16", @@ -110,3 +116,31 @@ def compute(model, input): torch_out2 = torch_out2.logits[0].detach().cpu().numpy() assert torch_out2.shape == jax_out.shape, f"{torch_out2.shape} != {jax_out.shape}" np.testing.assert_allclose(torch_out2, jax_out, rtol=1e-5, atol=1e-5) + + +@skip_if_no_torch +def test_llama3_rotary_embedding(): + import torch + from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding as HFLlamaRotaryEmbedding + + llama_config = get_config() + key = random.PRNGKey(0) + device = "cpu" + + lev_config = LlamaConfig.from_hf_config(llama_config) + HeadSize = lev_config.HeadSize + Pos = lev_config.Pos + seq_len = Pos.size + + x = random.normal(key, (1, seq_len)) + x_torch = torch.from_numpy(np.array(x)) + + levanter_emb = lev_config.rope.build(HeadSize, Pos) + levanter_output = (levanter_emb.cos, levanter_emb.sin) + + hf_rope = HFLlamaRotaryEmbedding(max_position_embeddings=seq_len, device=device, config=llama_config) + hf_output = hf_rope(x_torch, torch.arange(seq_len).reshape(1, -1)) + + for jax_out, torch_out in zip(levanter_output, hf_output): + torch_out = torch_out.numpy() + assert np.isclose(torch_out, np.array(jax_out.array), rtol=1e-2, atol=1e-2).all(), f"{torch_out} != {jax_out}" From 2b42bfbeb9362e86b49da4997a8ac17a50c9bf35 Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 24 Sep 2024 13:56:54 -0700 Subject: [PATCH 64/94] Support for running in a Ray cluster (#737) --- .gitignore | 5 +- docker/tpu/Dockerfile.cluster | 74 +++++++ docs/Getting-Started-TPU-VM.md | 304 +++++++++++++++++-------- docs/design/Ray-Job-Manager.md | 108 +++++++++ infra/cluster/job-cluster.yaml | 148 +++++++++++++ infra/cluster/push_cluster_docker.sh | 1 + infra/launch.py | 6 +- infra/launch_on_ray.py | 215 ++++++++++++++++++ pyproject.toml | 6 +- src/levanter/distributed.py | 7 +- src/levanter/infra/cli_helpers.py | 8 +- src/levanter/infra/docker.py | 9 + src/levanter/infra/ray_tpu.py | 319 +++++++++++++++++++++++++++ src/levanter/infra/tpus.py | 85 ++++++- src/levanter/store/cache.py | 1 - 15 files changed, 1187 insertions(+), 109 deletions(-) create mode 100644 docker/tpu/Dockerfile.cluster create mode 100644 docs/design/Ray-Job-Manager.md create mode 100644 infra/cluster/job-cluster.yaml create mode 100644 infra/cluster/push_cluster_docker.sh create mode 100755 infra/launch_on_ray.py create mode 100644 src/levanter/infra/ray_tpu.py diff --git a/.gitignore b/.gitignore index 8a6acca53..9615f94ab 100644 --- a/.gitignore +++ b/.gitignore @@ -150,6 +150,9 @@ ledger.json /checkpoints *.jaxpr -# local execution commands local_*.sh + +# aider .aider* + +.benchmarks diff --git a/docker/tpu/Dockerfile.cluster b/docker/tpu/Dockerfile.cluster new file mode 100644 index 000000000..69a109790 --- /dev/null +++ b/docker/tpu/Dockerfile.cluster @@ -0,0 +1,74 @@ +# This dockerfile is used to build the docker image for using Ray to manage TPU slices. 
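# The image is typically built and pushed with infra/cluster/push_cluster_docker.sh (added later in this
# change), i.e. roughly:
#   python infra/push_docker.py --docker_file docker/tpu/Dockerfile.cluster --image levanter-cluster --tag latest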
+ARG IMAGE=ghcr.io/stanford-crfm/levanter-base +ARG TAG=latest + +FROM ${IMAGE}:${TAG} + +# install docker in docker, but don't start it +RUN apt-get update && apt-get install -y docker.io + +ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\ + TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\ + RAY_USAGE_STATS_ENABLED=0\ + PATH=/opt/levanter/.venv/bin:$PATH\ + PYTHONPATH=/opt/levanter:/opt/levanter/src:/opt/levanter/examples:/opt/levanter/tests:src:.\ + HOME=/home/levanter +# Install dependencies + +RUN apt-get install -y \ + sudo \ + git \ + libjemalloc-dev \ + wget \ + cmake \ + g++ \ + zlib1g-dev \ + tmux \ + screen \ + rsync \ + netbase \ + openssh-client \ + gnupg + +RUN pip install --no-cache-dir \ + flatbuffers \ + cython==0.29.37 \ + # Necessary for Dataset to work properly. + numpy\>=1.20 \ + psutil \ + # Required a recent version of setuptools to be compatible with python 3.12+. + setuptools==71.1.0 \ + "google-api-python-client==1.7.8" \ + "google-oauth" + + +# Install gcloud so we can get secrets (maybe we should just curl?) +RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz + +RUN mkdir -p /usr/local/gcloud \ + && tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \ + && /usr/local/gcloud/google-cloud-sdk/install.sh \ + && rm -f /tmp/google-cloud-sdk.tar.gz + +# Adding the package path to local +ENV PATH=$PATH:/usr/local/gcloud/google-cloud-sdk/bin + +# GCP doesn't like it when root ssh's into a machine +RUN useradd -m -s /bin/bash levanter +RUN echo "levanter ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers +RUN usermod -aG docker levanter +RUN mkdir -p $HOME && touch $HOME/.bashrc && chown -R levanter $HOME +RUN echo "export PATH=$PATH" >> $HOME/.bashrc +RUN adduser levanter docker + +RUN chown -R levanter /opt/levanter + +USER levanter + +# HACK until https://github.com/ray-project/ray/issues/47769 is resolved +RUN pip install 'ray[default,gcp]==2.34.0' +RUN git clone https://github.com/dlwh/ray.git ~/ray --branch tpu_docker_2.34 --depth 1 +RUN cp ~/ray/python/ray/autoscaler/_private/gcp/tpu_command_runner.py /opt/levanter/.venv/lib/python3.10/site-packages/ray/autoscaler/_private/gcp/tpu_command_runner.py + + +WORKDIR /opt/levanter diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md index 3bcb26092..d0728d1c1 100644 --- a/docs/Getting-Started-TPU-VM.md +++ b/docs/Getting-Started-TPU-VM.md @@ -2,7 +2,50 @@ This guide will walk you through the steps to get started with Levanter on TPU VMs. -## Google Cloud Setup +## Overview + +An important thing to know about TPU VMs is that they are not a single machine (for more than a vX-8). Instead, they +are a collection of workers that are all connected to the same TPU pod. Each worker manages a set of 8 TPUs. +This means that you can't just run a single process on a TPU VM instance, you need to run a distributed process, +and you can't just set up one machine, but a whole cluster. We have some scripts to help with this. + +Our approach is to use Docker to package up the code and run it on the workers. TPU VMs already have Docker installed, +so we just need to build the image and run it. We use a combination of `gcloud` and `docker` to do this, and it's +mostly wrapped up in a script called `launch.py`. For handling preemptible compute and other failures, we have a +new script called `launch_on_ray.py` that uses Ray to automatically spin up TPUs, run jobs, and restart them if they fail. 
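As a quick preview of the workflow (the flags are covered in detail later in this guide), a typical `launch.py` invocation looks something like this:

```bash
# Build a Docker image from the current directory and run the command on every
# worker of the TPU slice (see "Using launch.py" below for configuration details).
python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml
```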
+ +We also have a legacy script called `spin-up-vm.sh` that can be used to create a TPU VM instance without any of the Docker stuff. + +### Preemptible TPUs + +Since much of our compute is preemptible, we have to account for the fact that TPU VMs can be preempted at any time. +Levanter is designed to be robust to this, but we still have to actually restart the job when it happens. +We refer to this as "babysitting" the job. We have two options for "babysitting" training jobs. + +1. `launch_on_ray.py` is a new, experimental script that uses Ray to manage the job and restart it if it fails. + This script is still in development, but it seems to basically work. +2. `launch.py` has a `--retries` flag that will automatically restart the job if it fails. To use this, + `launch.py` must be running in foreground mode and must maintain a connection to the TPU VM instance. + +## Installation + +### Install Levanter + +First, you need to clone the Levanter repository and install the dependencies. You can do this with the following commands: + +```bash +git clone https://github.com/stanford-crfm/levanter.git +cd levanter +pip install -e . +``` + +### Docker + +Docker is a tool that allows you to package up code and run it in a container. You should install Docker +on your local machine. Here are some instructions for [installing Docker](https://docs.docker.com/engine/install/) +if you don't already have it. If you're not planning on using `launch.py` or `launch_on_ray.py`, you don't need Docker. + +### Google Cloud setup First you need gcloud installed and configured. You can find instructions for that [here](https://cloud.google.com/sdk/docs/quickstarts) or if you're a conda person you can just run `conda install -c conda-forge google-cloud-sdk`. @@ -27,72 +70,19 @@ find more information about that [here](https://cloud.google.com/docs/authentica Honestly, if you're working outside of a corp environment and not dealing with private data, I don't bother... You may also need to create an SSH key and add it to your Google Cloud account. Consider using -[GCloud's guide on ssh keys](https://cloud.google.com/compute/docs/connect/add-ssh-keys#metadata) (or OS Login if you do that) +[gcloud's guide on ssh keys](https://cloud.google.com/compute/docs/connect/add-ssh-keys#metadata) (or OS Login if you do that) to set up ssh keys and [using `ssh-agent`](https://kb.iu.edu/d/aeww) to make executing the SSH commands easier. -## Creating a TPU VM Instance - -An important thing to know about TPU VMs is that they are not a single machine (for more than a v3-8). Instead, they -are a collection of workers that are all connected to the same TPU pod. Each worker manages a set of 8 TPUs. -This means that you can't just run a single process on a TPU VM instance, you need to run a distributed process, -and you can't just set up one machine, but a whole cluster. We have some scripts to help with this. - -### Automatic Setup - -You can use `infra/spin-up-vm.sh` to create a TPU VM instance. In addition to creating the instance, it will set up -the venv on each worker, and it will clone the repo to `~/levanter/`. - -**For Public Users**: - -```bash -bash infra/spin-up-vm.sh -z -t -n [--preemptible] [--use-alpha] -``` - -Defaults are: -- `zone`: `us-east1-d` -- `type`: `v3-32` -- `subnetwork`: `default` (set to custom VPC subnet, useful for tpuv4 configs) -- `preemptible`: `false` -- `use-alpha`: `false` (mark `true` for tpuv4s in alpha zones like `us-central2`) - -**Notes**: - -* This uploads setup scripts via scp. 
If the ssh-key that you used for Google Cloud requires passphrase or your ssh key -path is not `~/.ssh/google_compute_engine`, you will need to modify the script. -* The command will spam you with a lot of output, sorry. -* If you use a preemptible instance, you probably want to use the ["babysitting" script](#babysitting-script) to -the VM. That's explained down below in the [Running Levanter GPT-2](#running-levanter-gpt-2) section. - - -## Useful commands - -### SSHing into one TPU VM worker - -`gcloud compute tpus tpu-vm ssh $name --zone us-east1-d --worker=0` - -### Running a command on all workers (in parallel) -`gcloud compute tpus tpu-vm ssh $name --zone us-east1-d --worker=all --command="echo hello"` - -### SCPing a file to all workers -`gcloud compute tpus tpu-vm scp my_file $name:path/to/file --zone us-east1-d --worker=all` - -### SCPing a file to one worker -`gcloud compute tpus tpu-vm scp my_file $name:path/to/file --zone us-east1-d --worker=0` - -### SCPing a file from one worker -`gcloud compute tpus tpu-vm scp $name:path/to/file my_file --zone us-east1-d --worker=0` - -## Running Levanter GPT-2 -Now that you have a TPU VM instance, you can follow the [Getting Started](Getting-Started-Training.md) steps, but here are a few shortcuts: +## Using `launch.py` -### Launch a GPT-2 Small in unattended mode +### Configuration You will need a [Docker installation](https://docs.docker.com/engine/install/) on your development machine to build and run images on TPUs. First create a configuration file for future launches in your Levanter directory: -``` +```bash cat > .config < + LIBTPU_INIT_ARGS: # Optional -docker_repository: levanter -zone: us-west4-a +docker_repository: levanter # default +zone: us-west4-a # if not set, will use your default zone tpu_name: test-spin-up-32 tpu_type: "v5litepod-16" -vm_image: "tpu-ubuntu2204-base" +vm_image: "tpu-ubuntu2204-base" # default capacity_type: "preemptible" autodelete: false -subnetwork: "default" +subnetwork: "default" # default EOF ``` -If you want to customize the docker image that is created and uploaded to GCP Artifact Registry, you can add config `image_name: "YOUR-DOCKER-NAME"`. +If you want to customize the docker image that is created and uploaded to Artifact Registry, you can add config `image_name: "YOUR-DOCKER-NAME"`. -Note that you can also configure docker to push to GHCR by setting -``` +#### (Optional) Using GitHub Container Registry + +Note that you can also Configure docker to push to GHCR by setting + +```yaml docker_registry: ghcr github_user: github_token: ``` -By default, the tpu instance won't be able to access the docker image, so you may need to make it public. + +By default, the TPU instance won't be able to access the Docker image, so you may need to make it public. To do +so, navigate to the GitHub profile or organization that owns the Docker image (e.g. https://github.com/orgs/stanford-crfm/packages), +click on the package, and then click on the "Make public" button. GitHub will display a scary warning about how +this will make the package public, but that's what you want. + +To get a GitHub token, see [this guide on creating access tokens](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token) +and [the GitHub Container Registry docs](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry#authenticating-to-the-container-registry). 
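If you want to sanity-check the token locally before putting it in the config above, a standard `docker login` against GHCR works (the username below is a placeholder; this step is optional and not something `launch.py` itself requires):

```bash
# Log in to GitHub Container Registry with a personal access token that has package scopes.
echo "$GITHUB_TOKEN" | docker login ghcr.io -u <your-github-username> --password-stdin
```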
+ +### Launch a GPT-2 Small in the background Now run `launch.py`. This will package your current directory into a Docker image and run it on your workers. Everything after the `--` is run on each worker. @@ -131,21 +133,53 @@ Now run `launch.py`. This will package your current directory into a Docker imag python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' ``` +The command you run should be run as though it's being run on the TPU VM, from the root of the Levanter repo. Everything +in your current directory not covered by `.dockerignore` will be copied to the TPU VM. (This can lead to surprises +if you have large files in your directory that you don't want to copy over.) + ### Launch a GPT-2 Small in interactive mode -To run in the foreground, use `--foreground` with the `launch.py` script. You should use tmux or something for long running jobs for this version. It's mostly for debugging. +To run in the foreground, use `--foreground` with the `launch.py` script. You should use tmux or something for long-running jobs for this version. + ```bash python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' ``` +### Running your own config + +If you want to run your own config, we suggest you start from one of the existing configs. Just copy it to +a new file: + +`cp config/gpt2_small.yaml config/my_config.yaml` + +If you're using `launch.py`, the config will be automatically uploaded as part of your Docker image, so you +can just reference the local config path in your command line: + +``` + +Afterward, you can use the config directly from the TPU VM instance, e.g.: + +```bash + python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \ + --trainer.checkpointer.base_path gs://path/to/checkpoints/ +``` + +With this configuration (unless `trainer.load_checkpoint` is false), Levanter will automatically +try to load the latest checkpoint if it exists. + +Tokenizers and configuration files are loaded via `fsspec` which supports remote +filesystems, so you can also copy your tokenizer or config file to GCS and use +a `gs://` path to access it. + ### Using an external directory or file In case that you want to reference some external directory/file outside of the levanter repo, you can do it by adding the external directory/file to the docker image so that it becomes accessible in TPU instances. You can specify the path you want to add as extra buildl context by `--extra_context` with the `launch.py` script. Then, you should be able to use the external files in arguments in `train_lm.py` etc. + ```bash python infra/launch.py --extra_context -- python src/levanter/main/train_lm.py --config_path --trainer.checkpointer.base_path gs://' ``` -### Babysitting Script +### Babysitting script for preemptible TPUs If you are using a preemptible TPU VM, you probably want to use the "babysitting" version of the script to keep an eye on the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours. @@ -161,50 +195,70 @@ That `--` is important! It separates the spin up args from the running args. Also you should always use `--foregrouund` with `babysit-tpu-vm`, as the background mode will always return immediately. -### Running your own config -If you want to run your own config, we suggest you start from one of the existing configs. 
Just copy it to -a new file: +## Using the Ray Autoscaler -`cp config/gpt2_small.yaml config/my_config.yaml` +We use Ray's autoscaler to manage the TPU VM instances. This is a more robust way to manage the instances, as it will +automatically restart them if they fail. It also allows you to easily scale up the number of instances if you need more +compute. -If you're using `launch.py`, the config will be automatically uploaded as part of your Docker image, so you -can just reference the local config path in your command line: +### Configuration + +Since Levanter already uses Ray, you don't need to install anything new. You just need to set up your configuration file. +We have a template configuration file in `infra/cluster/job-cluster.yaml`. You can modify this file to suit your needs. +In particular, you should set the GCP project, zone, and which TPU slice types you want to use. The default configuration +enables v4 slices of various sizes. +**Note that the default configuration uses an n2-standard-2 instance as the head node. This costs about $70/month.** +This is considerably smaller than [Ray's guidance for the head node](https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#configuring-the-head-node). +If you need to save money, you can also look into committing to a year of usage to save money, or potentially you could +use a non-preemptible TPU VM instance as the head node if you have non-preemptible TRC TPUs. + +### Launching the Cluster + +To launch the cluster, you can run the following command: + +```bash +ray up infra/cluster/job-cluster.yaml ``` -Afterward, you can use the config directly from the TPU VM instance, e.g.: +This will create the head node and the minimum number of workers. You can then submit jobs to the cluster. First, +you should establish a connection to the Ray dashboard: ```bash - python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \ - --trainer.checkpointer.base_path gs://path/to/checkpoints/ +ray dashboard infra/cluster/job-cluster.yaml ``` -With this configuration (unless `trainer.load_checkpoint` is false), Levanter will automatically -try to load the latest checkpoint if it exists. +Then, **in a separate terminal**, you can submit a job to the cluster. To replicate the previous example, you can run: -Tokenizers and configuration files are loaded via `fsspec` which supports remote -filesystems , so you can also copy your tokenizer or config file to GCS and use -a `gs://` path to access it. +```bash +export RAY_ADDRESS=http://localhost:8265 # tell ray where the cluster is +python infra/launch_on_ray.py --tpu_type v4-32 --foreground --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' +``` +Even without `--foreground`, the job will be restarted if it fails. The `--tpu_type` flag is required, and should be +one of the TPU types you enabled in the cluster configuration. -## Common Issues -### (CRFM) Permission denied on `/files` +This command will print various options for monitoring the job. You can use the Ray dashboard to monitor the job, or you can +stop the job with: + +```bash +ray job stop +``` -If you get a permission denied error on `/files`, you probably need to run `sudo chmod -R a+rw /files/whatever` on the -TPU VM instance. This is because the TPU VM instance sets different UID/GID for the user on each and every worker, so -you need to make sure that the permissions are set correctly. These periodically get messed up. 
A umask would probably -fix this. (TODO!) +If `--foreground` is present, the script will tail the logs of the job. -### (CRFM) Git permissions issues +### Monitoring the Cluster -Git doesn't like doing operations in a directory that is owned by root or that has too funky of permissions. If you get a git error, you probably need to -add a safe directory on your workers: +If you've launched the cluster, you can look at the Ray dashboard to see the status of the cluster by +navigating to `http://localhost:8265` in your browser. You can also monitor the autoscaler logs with the following command: ```bash -gcloud compute tpus tpu-vm ssh $NAME --zone $ZONE --worker=all --command 'git config --global --add safe.directory /files/' +ray exec infra/cluster/job-cluster.yaml "tail -n 100 -f /tmp/ray/session_latest/logs/monitor*" ``` +## Common Issues + ### Can't find TPUs If you get the warning `No GPU/TPU found, falling back to CPU.` then something else might be using the TPU, like a zombie python @@ -238,3 +292,67 @@ gcloud auth configure-docker ${GCP_ZONE}.pkg.dev # for example: gcloud auth configure-docker us-central2-docker.pkg.dev ``` + +#### Too big of a Docker image + +If you're concerned your Docker images are taking too long to push, especially after a first push, you can try to +reduce the size of the image. One way to do this is to add more entries to the `.dockerignore` file in the root of the +Levanter repo. This file is used by Docker to determine what files to ignore when building the image. + +To see what files are likely taking up the most space, you can run the following command: + +```bash +ncdu -X .dockerignore +``` + +This will show you a list of files and directories in the repo, sorted by size, excluding files that are in the `.dockerignore` file. +(There are slight differences between how `ncdu` and Docker interpret the `.dockerignore` file, so this isn't perfect, but it's usually pretty close.) + +## Creating a TPU VM Instance + + + +### Automatic Setup + +You can use `infra/spin-up-vm.sh` to create a TPU VM instance. In addition to creating the instance, it will set up +the venv on each worker, and it will clone the repo to `~/levanter/`. + +**For Public Users**: + +```bash +bash infra/spin-up-vm.sh -z -t -n [--preemptible] [--use-alpha] +``` + +Defaults are: +- `zone`: `us-east1-d` +- `type`: `v3-32` +- `subnetwork`: `default` (set to custom VPC subnet, useful for tpuv4 configs) +- `preemptible`: `false` +- `use-alpha`: `false` (mark `true` for tpuv4s in alpha zones like `us-central2`) + +**Notes**: + +* This uploads setup scripts via scp. If the ssh-key that you used for Google Cloud requires passphrase or your ssh key +path is not `~/.ssh/google_compute_engine`, you will need to modify the script. +* The command will spam you with a lot of output, sorry. +* If you use a preemptible instance, you probably want to use the ["babysitting" script](#babysitting-script) to +the VM. That's explained down below in the [Running Levanter GPT-2](#running-levanter-gpt-2) section. 
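Once the script finishes, a quick way to confirm the instance actually exists before using the commands below is to list the TPU VMs in your zone (the zone here matches the default above):

```bash
gcloud compute tpus tpu-vm list --zone us-east1-d
```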
+ + +## Useful commands + +### SSHing into one TPU VM worker + +`gcloud compute tpus tpu-vm ssh $name --zone us-east1-d --worker=0` + +### Running a command on all workers (in parallel) +`gcloud compute tpus tpu-vm ssh $name --zone us-east1-d --worker=all --command="echo hello"` + +### SCPing a file to all workers +`gcloud compute tpus tpu-vm scp my_file $name:path/to/file --zone us-east1-d --worker=all` + +### SCPing a file to one worker +`gcloud compute tpus tpu-vm scp my_file $name:path/to/file --zone us-east1-d --worker=0` + +### SCPing a file from one worker +`gcloud compute tpus tpu-vm scp $name:path/to/file my_file --zone us-east1-d --worker=0` diff --git a/docs/design/Ray-Job-Manager.md b/docs/design/Ray-Job-Manager.md new file mode 100644 index 000000000..88ac8bc96 --- /dev/null +++ b/docs/design/Ray-Job-Manager.md @@ -0,0 +1,108 @@ +# Ray TPU Job Manager + +This is a quick design document to explain how our Ray TPU Job Manager works. + +## Introduction + +Please see the [Ray documentation](https://docs.ray.io/en/latest/index.html) for more information on how Ray works. We provide only a brief overview here. + +Ray is a resource-aware job scheduler, so you can specify the resources that a job requires: + +```python +@ray.remote(num_cpus=4) +def my_cpu_job(): + ... +``` + +For GPUs, Ray lets you specify the number of GPUs you need: + +```python +@ray.remote(num_gpus=1) +def my_gpu_job(): + ... +``` + +In Ray, TPUs are roughly represented the same way, but there are a number of problems with that approach. +In particular: + +* Ray's granularity allows it to schedule a task on a single machine, not across multiple machines. In particular, +Ray can't directly schedule a task on a TPU slice that spans multiple machines (more precisely, multiple workers that) +are part of the same TPU slice.) +* Google requires that only one process on a machine can access the TPU at a time. This causes issues with Ray's +worker pool, which doesn't exit between tasks. We need to work around this. + +This document explains how we work around those problems. + +### A Note on Terminology + +In the TPU world, a "TPU" is an accelerator card that is controlled by a VM called a worker. TPUs are arranged in "pods" and you can +get a slice of a pod (e.g. v4-256). Each worker controls 4 TPU cards, which is sometimes modeled as 8 TPU devices +and sometimes as 4 TPU devices, depending on TPU version. + +Ray's fundamental abstraction is the "task." A task is modeled as a Python function decorated with `@ray.remote` +that runs in a process pool on some machine. It returns a future that can be used to get the result of the task. + +In this document, I use "job" to mean something like an experiment run. It's a command that we want to run on +all the workers of a TPU slice until it completes, resuming from where it left off if it is preempted. +To run a job on a TPU slice, we will need to create a number of tasks that run on the workers of the TPU slice. +When a job is preempted, we need to reschedule the job by creating new tasks. + +## Ray+TPU + +### Scheduling Slices of TPUs + +TPU slices must be used in a SPMD manner (this is probably not quite true, but it's the easiest way to use them). +This means that you need to run the same code on all workers of a slice at once. +Ray can't really do this directly. That is, you can't say: + +```python +@ray.remote(tpu_slice="v4-256") +def my_tpu_job(): + ... +``` + +But you almost can, with a bit of indirection. 
Allen Wang (@allenwang28) at Google wrote [this gist](https://gist.github.com/allenwang28/e3400b9e9212b50aa1cda55ebeccea60#file-ray_tpu_task-py) that is most +of the way to a solution. The key idea is to schedule a task on the special `"TPU-${TPU_TYPE}-head"` resource +(where `${TPU_TYPE}` is like `"v4-256"`). If you start a job with this resource, you essentially get a "lock" on the TPU +slice. Once you have the lock, you can query the VM to get a unique resource that is shared only for this particular +slice. (Specifically, this resource is the unique pod slice name of the TPU slice `ray.util.accelerators.tpu.get_current_pod_name()`.) +You can then use this resource to schedule K tasks on the K workers that are on the same slice. These tasks do the actual work. + +Managing preemption is then just a question of rescheduling the job when it gets preempted: getting a new head node, +getting a new pod slice name, and rescheduling the K tasks. +Detecting preemption (as opposed to application failure) is a bit tricky and still not fully tested. + +### Dealing with `libtpu` + +`libtpu` is the library that interfaces with the TPU. `libtpu` has a hard requirement that only one process on a machine +can access the TPU at a time. It manages this with a lockfile called `/tmp/libtpu_lockfile`. Ordinarily, this is fine, +as the lockfile is removed when the process exits. However, Ray maintains a pool of workers that don't ordinarily exit +between tasks. This means that the lockfile is not removed, and the next task that tries to access the TPU will fail. + +As best I can tell, it's actually fine to remove this lockfile so long as you're not trying to access the TPU from +multiple processes on the same machine. Because we're using Ray to lock the resources, we're guaranteed that only one +process will be accessing the TPU at a time, so we just remove the lockfile when the task finishes. + +Also note that we say that each worker only has 1 TPU, even though it has 4 (or 8) TPU devices. This is because +`libtpu` only lets one process access the TPU at a time, so the TPU functions more as a boolean lock than +as a semaphore. + +## Ray+TPU+Docker + +So above we have the core idea of how to use Ray with TPUs. However, there are a few additional complications when +we want our jobs to be running in separate docker containers. Some of this is just dealing with Ray+Docker, but some of it +is Ray+TPU+Docker specific. + +We use a Docker container to run the core Ray scheduling process on each machine. We also want to use a different +per-job Docker container to run actual jobs. In theory, Ray can run tasks inside task-specific docker images, but I've heard it +doesn't work well. We also want to avoid a full Docker-in-Docker setup (which I've also heard is tricky), so we +instead want the scheduler to launch sibling containers. To do that, we bind-mount the docker socket into the +scheduler container. + +## Ray+TPU+Docker + +Above we discussed dealing with the TPU lockfile. The only real remaining issues are: + +* you have to use `--privileged` to use TPUs. +* There's a bug in Ray's TPU/Docker support that [causes the `TPU--head` resource to be assigned to all workers](https://github.com/ray-project/ray/pull/47777), +not just the leader. We have a patch. 
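To make the scheduling discussion above concrete, here is a condensed sketch of the head-resource pattern. The names are illustrative and this is not the production code; the full implementation, including preemption handling and lockfile cleanup, is in `src/levanter/infra/ray_tpu.py` in this change.

```python
import ray


def run_on_slice(per_host_fn, tpu_type: str):
    # `per_host_fn` is expected to already be a @ray.remote function taking no arguments.

    # Holding the "TPU-<type>-head" resource acts as a lock on one whole slice.
    @ray.remote(resources={f"TPU-{tpu_type}-head": 1})
    def acquire_slice_and_fan_out(fn):
        # Once we hold the head resource, find out which slice we were given and how
        # many hosts it has, then schedule one copy of `fn` on each host of that slice.
        pod_name = ray.util.accelerators.tpu.get_current_pod_name()
        num_hosts = ray.util.accelerators.tpu.get_current_pod_worker_count()
        on_this_slice = fn.options(resources={pod_name: 1, "TPU": 1})
        return ray.get([on_this_slice.remote() for _ in range(num_hosts)])

    return acquire_slice_and_fan_out.remote(per_host_fn)
```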
diff --git a/infra/cluster/job-cluster.yaml b/infra/cluster/job-cluster.yaml new file mode 100644 index 000000000..652771fcb --- /dev/null +++ b/infra/cluster/job-cluster.yaml @@ -0,0 +1,148 @@ +# Configures a Ray Cluster with TPU Slices of various sizes +# If you're at Stanford CRFM, you probably don't need to change this file +# If you're not at Stanford CRFM, you should change this file to match your GCP project +# Specifically: +# - Change `project_id` to your GCP project +# - Change the `availability_zone` to match where you have TPUs available +# - Change the `region` to match where you have TPUs available +# - Change to the TPU quota you have available +# cf: https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/gcp/example-full.yaml +# cf: https://docs.ray.io/en/latest/cluster/vms/references/ray-cluster-configuration.html +# Unique Identifier for the Head Node + Workers +cluster_name: levanter-cluster + +# Configure GCP +provider: + type: gcp + region: us-central2 + availability_zone: us-central2-b + project_id: hai-gcp-models + +# Maximum Workers (excluding Head Node) +max_workers: 1024 +upscaling_speed: 4.0 # for bursty + +# List of Available Node Types +available_node_types: + # Head Node =>> On-Demand, sets Min/Max Workers = 0 (Prevent Scheduling Tasks on Head Node) + head_default: + min_workers: 0 + max_workers: 0 + resources: {"CPU": 32} + + # GCP-Specific Configuration; by default, Ray will configure unspecified fields (e.g., subnets, ssh-keys) + # => Ref: https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert + node_config: + machineType: n2-standard-2 + + # Create a Persistent Disk w/ 100 GBs + disks: + - boot: true + autoDelete: true + type: PERSISTENT + initializeParams: + diskSizeGb: 100 + + # Set Source Image =>> Ubuntu 22.04 Base VM + sourceImage: projects/ubuntu-os-cloud/global/images/family/ubuntu-2204-lts + + # Worker Nodes =>> + tpu_slice_v4_32: + min_workers: 0 + max_workers: 1024 + resources: { "CPU": 120, "TPU": 1 } + + node_config: + acceleratorType: v4-32 + runtimeVersion: tpu-ubuntu2204-base + + # [IMPORTANT] Configure all TPU Workers to be Preemptible! + schedulingConfig: + preemptible: true + + tpu_slice_v4_64: + min_workers: 0 + max_workers: 1024 + resources: {"CPU": 120, "TPU": 1} + + node_config: + acceleratorType: v4-64 + runtimeVersion: tpu-ubuntu2204-base + + # [IMPORTANT] Configure all TPU Workers to be Preemptible! + schedulingConfig: + preemptible: true + + # more slices + tpu_slice_v4_128: + min_workers: 0 + max_workers: 1024 + resources: { "CPU": 120, "TPU": 1 } + + node_config: + acceleratorType: v4-128 + runtimeVersion: tpu-ubuntu2204-base + + # [IMPORTANT] Configure all TPU Workers to be Preemptible! + schedulingConfig: + preemptible: true + + tpu_slice_v4_256: + min_workers: 0 + max_workers: 1024 + resources: { "CPU": 120, "TPU": 1 } + + node_config: + acceleratorType: v4-256 + runtimeVersion: tpu-ubuntu2204-base + + # [IMPORTANT] Configure all TPU Workers to be Preemptible! + schedulingConfig: + preemptible: true + + tpu_slice_v4_512: + min_workers: 0 + max_workers: 1024 + resources: { "CPU": 120, "TPU": 1 } + + node_config: + acceleratorType: v4-512 + runtimeVersion: tpu-ubuntu2204-base + + # [IMPORTANT] Configure all TPU Workers to be Preemptible! 
+ schedulingConfig: + preemptible: true + +docker: + image: "ghcr.io/stanford-crfm/levanter-cluster:latest" + container_name: "ray_docker" + pull_before_run: true + worker_run_options: + - --privileged + - --ulimit memlock=-1:-1 # + - --shm-size=32gb + - -e TPU_WORKER_ID + - -v "/tmp:/tmp" + # this lets the worker run docker commands and have them run as sibling containers + - -v "/var/run/docker.sock:/var/run/docker.sock" + +initialization_commands: + - yes | gcloud auth configure-docker us-central2-docker.pkg.dev + - "export TPU_WORKER_ID=$(curl -H 'Metadata-Flavor: Google' http://metadata.google.internal/computeMetadata/v1/instance/attributes/agent-worker-number) || true" + - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f) + # always run this because ray doesn't run with sudo + - sudo usermod -aG docker $USER + # we want to launch docker containers from inside docker, which means we need to loosen the permissions on the docker + # socket. This isn't the best security practice, but it's the easiest way to get this working. + - sudo chmod 666 /var/run/docker.sock + +head_setup_commands: + - mkdir $HOME/.cache/huggingface -p + - gcloud secrets versions access latest --secret=HF_TOKEN > $HOME/.cache/huggingface/token || true + +worker_setup_commands: + - mkdir $HOME/.cache/huggingface -p + - gcloud secrets versions access latest --secret=HF_TOKEN > $HOME/.cache/huggingface/token || true + +# Set Head Node == `ray_head_default` +head_node_type: head_default diff --git a/infra/cluster/push_cluster_docker.sh b/infra/cluster/push_cluster_docker.sh new file mode 100644 index 000000000..ca049c357 --- /dev/null +++ b/infra/cluster/push_cluster_docker.sh @@ -0,0 +1 @@ +python infra/push_docker.py --docker_file docker/tpu/Dockerfile.cluster --image levanter-cluster --tag latest $* diff --git a/infra/launch.py b/infra/launch.py index ec241fcec..05d4fffac 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -89,7 +89,11 @@ def main(): tag = int(time.time()) with docker.copy_extra_ctx(extra_context) as extra_context: - build_args = {"EXTRA_CTX": extra_context} if extra_context else None + build_args = {"EXTRA_CTX": extra_context} if extra_context else {} + base_image, base_tag = docker.split_image_and_tag(args.docker_base_image) + build_args["IMAGE"] = base_image + build_args["TAG"] = base_tag + local_id = docker.build_docker( docker_file="docker/tpu/Dockerfile.incremental", image_name=image_id, tag=tag, build_args=build_args ) diff --git a/infra/launch_on_ray.py b/infra/launch_on_ray.py new file mode 100755 index 000000000..2040aff44 --- /dev/null +++ b/infra/launch_on_ray.py @@ -0,0 +1,215 @@ +#!/usr/bin/python +# Similar to launch.py, but this instead launches on a Ray cluster configured with auto-scaling TPUs + +import argparse +import getpass +import os +import tempfile +import time +from pathlib import Path + +import draccus +from ray.dashboard.modules.job.common import JobStatus +from ray.dashboard.modules.job.sdk import JobSubmissionClient + +import levanter.infra.cli_helpers as cli +import levanter.infra.docker as docker + + +def main(): + parser = argparse.ArgumentParser() + config = cli.load_config() + + cli.add_arg(parser, config, ["--docker_base_image"], default="ghcr.io/stanford-crfm/levanter-base:latest") + cli.add_arg(parser, config, ["--docker_repository"], default="levanter") + cli.add_arg(parser, config, ["--address"], default="http://127.0.0.1:8265") + cli.add_arg(parser, 
config, ["--image_name"], default=f"levanter-{getpass.getuser()}") + cli.add_capacity_type_args(parser, config) + cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"]) + cli.add_arg(parser, config, ["--tpu_type"], required=True) + # TODO: bring node_count to Ray + # cli.add_arg(parser, config, ["--node_count"], default=1, type=int) + cli.add_arg(parser, config, ["--foreground"], default=False, action="store_true") + cli.add_arg(parser, config, ["--retries"], default=10, type=int) + cli.add_arg(parser, config, ["--run_id"], default=cli.default_run_id(), type=str) + cli.add_arg(parser, config, ["--docker_registry"], default="gcp", choices=["gcp", "ghcr"]) + cli.add_arg(parser, config, ["--github_user"], type=str) + cli.add_arg(parser, config, ["--github_token"], type=str) + cli.add_arg(parser, config, ["--extra_context"], type=Path, required=False, default=None) + cli.add_arg(parser, config, ["--zone"], default=None, type=str, required=False) + + parser.add_argument( + "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=list(config.get("env", {}).items()) + ) + parser.add_argument("command", nargs=argparse.REMAINDER) + + args = parser.parse_args() + + command = args.command + docker_repository = args.docker_repository + image_id = args.image_name + project = args.project + if args.retries < 0: + retries = 10000000 + else: + retries = args.retries + + tpu_type = args.tpu_type + + zone = args.zone + run_id = args.run_id + registry = args.docker_registry + github_user = args.github_user + github_token = args.github_token + extra_context = args.extra_context + + if zone is None: + zone = cli.gcloud_config()["zone"] + + if zone is None: + raise ValueError("Zone must be specified or set in gcloud config.") + + region = "-".join(zone.split("-")[:-1]) + + if command[0] == "--": + command = command[1:] + + # make an image tag based on the unix timestamp to ensure we always pull the latest image + tag = int(time.time()) + + with docker.copy_extra_ctx(extra_context) as extra_context: + build_args = {"EXTRA_CTX": extra_context} if extra_context else {} + base_image, base_tag = docker.split_image_and_tag(args.docker_base_image) + build_args["IMAGE"] = base_image + build_args["TAG"] = base_tag + + local_id = docker.build_docker( + docker_file="docker/tpu/Dockerfile.incremental", image_name=image_id, tag=tag, build_args=build_args + ) + + if registry == "ghcr": + full_image_id = docker.push_to_github( + local_id=local_id, + github_user=github_user, + github_token=github_token, + ) + elif registry == "gcp": + full_image_id = docker.push_to_gcp( + local_id=local_id, + project_id=project, + region=region, + repository=docker_repository, + ) + else: + raise ValueError(f"Unknown docker registry: {registry}") + + env = {k: v for k, v in args.env} + + if "WANDB_PROJECT" not in env: + env["WANDB_PROJECT"] = "levanter" + + env["GIT_COMMIT"] = cli.get_git_commit() + env["RUN_ID"] = run_id + env["WANDB_DOCKER"] = full_image_id + + # run_docker_on_pod( + # full_image_id, + # command=command, + # tpu_type=tpu_type, + # env=env, + # retries=retries, + # ) + + # Submit the job to the Ray cluster. 
We have to use the JobSubmissionClient to do this and stringify the arguments + # we want: + from levanter.infra.ray_tpu import RunOnPodConfig + + config = RunOnPodConfig( + image_id=full_image_id, + command=command, + tpu_type=tpu_type, + env=env, + name="levanter", + retries=retries, + ) + + with tempfile.NamedTemporaryFile(suffix=".yaml", prefix=f"launch-{run_id}-", dir=".") as f: + yaml = draccus.dump(config) + f.write(yaml.encode("utf-8")) + f.flush() + + f_name = os.path.relpath(f.name) + print(f"Submitting job with config path {f_name}") + + client = JobSubmissionClient(args.address) + + job_id = _make_unique_job_id(client, run_id) + + job_id = client.submit_job( + entrypoint=f"python src/levanter/infra/ray_tpu.py --config_path {f_name}", + runtime_env={"working_dir": "./"}, + job_id=job_id, + ) + + print( + f""" +------------------------------------------------------- +Job '{job_id}' submitted successfully +------------------------------------------------------- + +Next steps + Query the logs of the job: + ray job logs {job_id} + Query the status of the job: + ray job status {job_id} + Request the job to be stopped: + ray job stop {job_id} +""" + ) + + if args.foreground: + + async def tail_job(job_id): + async for line in client.tail_job_logs(job_id): # type: ignore + print(line, end="") + + status = client.get_job_status(job_id) + if status in {JobStatus.FAILED, JobStatus.SUCCEEDED, JobStatus.STOPPED}: + break + + print("Tailing job logs") + wait_until_status( + client, job_id, {JobStatus.RUNNING, JobStatus.FAILED, JobStatus.SUCCEEDED, JobStatus.STOPPED} + ) + # tail_job(job_id) + import asyncio + + asyncio.run(tail_job(job_id)) + + +def wait_until_status(client, job_id, status_to_wait_for, timeout_seconds=5): + start = time.time() + while time.time() - start <= timeout_seconds: + status = client.get_job_status(job_id) + print(f"status: {status}") + if status in status_to_wait_for: + break + time.sleep(1) + + +# try to make the job id be the same as the run id, but if it already exists, just make it unique +def _make_unique_job_id(client, run_id): + job_id = run_id + try: + while client.get_job_status(job_id) is not None: + job_id = f"{run_id}-{time.time_ns()}" + except Exception as e: # noqa + if "does not exist" in str(e): + pass + else: + raise + return job_id + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 0b72b20f4..add95b4ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ ] dependencies = [ "haliax>=1.4.dev307", - "equinox==0.11.6", + "equinox==0.11.3", "jaxtyping>=0.2.20", "tokenizers>=0.15.2", "transformers>=4.41.2", @@ -37,14 +37,14 @@ dependencies = [ "braceexpand>=0.1.7", "jmp>=0.0.3", "fsspec[http]>=2024.2,<2024.10", - "tensorstore==0.1.65", + "tensorstore==0.1.63", "pytimeparse>=1.1.8", "humanfriendly==10.0", "safetensors[numpy]~=0.4.2", "matplotlib>=3.7.0", "tblib>=1.7.0,<4.0.0", "dataclasses-json~=0.6.4", - "ray[default]>=2.34.0", + "ray[default]==2.34.0", "pydantic<3", "rich~=13.0", "filelock~=3.13", diff --git a/src/levanter/distributed.py b/src/levanter/distributed.py index 112409743..6efd9f0cb 100644 --- a/src/levanter/distributed.py +++ b/src/levanter/distributed.py @@ -276,7 +276,12 @@ def _munge_address_port(address: str): else: logger.warning(f"Failed to initialize ray with address {address}. 
Retrying...") continue - atexit.register(lambda: ray.shutdown()) + + def do_shutdown(): + logger.info("Shutting down ray...") + ray.shutdown() + + atexit.register(do_shutdown) _already_initialized = True diff --git a/src/levanter/infra/cli_helpers.py b/src/levanter/infra/cli_helpers.py index 5b1f87f01..eef8fa969 100644 --- a/src/levanter/infra/cli_helpers.py +++ b/src/levanter/infra/cli_helpers.py @@ -59,12 +59,12 @@ def get_git_commit(): return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip() -def make_docker_run_command(image_id, command, *, foreground, env): +def make_docker_run_command(image_id, command, *, foreground, env, name="levanter"): docker_command = [ "docker", "run", "-t" if foreground else "-d", - "--name=levanter", + f"--name={name}", "--privileged", "--shm-size=32gb", "--net=host", @@ -76,9 +76,9 @@ def make_docker_run_command(image_id, command, *, foreground, env): ] for k, v in env.items(): - docker_command.extend(["-e", k + f"='{str(v)}'"]) + docker_command.extend(["-e", k + f"={str(v)}"]) - docker_command.extend([image_id, " ".join(command)]) + docker_command.extend([image_id, *command]) return docker_command diff --git a/src/levanter/infra/docker.py b/src/levanter/infra/docker.py index 39f3b1325..d48b558a5 100644 --- a/src/levanter/infra/docker.py +++ b/src/levanter/infra/docker.py @@ -227,3 +227,12 @@ def push_to_gcp(local_id, project_id, region, repository) -> str: _run(["docker", "push", full_image_name]) return f"{artifact_repo}/{local_id}" + + +def split_image_and_tag(docker_base_image): + if ":" in docker_base_image: + base_image, base_tag = docker_base_image.rsplit(":", 1) + else: + base_image = docker_base_image + base_tag = "latest" + return base_image, base_tag diff --git a/src/levanter/infra/ray_tpu.py b/src/levanter/infra/ray_tpu.py new file mode 100644 index 000000000..69f25d02a --- /dev/null +++ b/src/levanter/infra/ray_tpu.py @@ -0,0 +1,319 @@ +import dataclasses +import logging +import os +import subprocess +from dataclasses import dataclass +from typing import Sequence + +import draccus +import ray +from ray.exceptions import NodeDiedError, RayError, RaySystemError, RayTaskError, WorkerCrashedError +from ray.remote_function import RemoteFunction + +from levanter.infra.cli_helpers import make_docker_run_command + + +# CF https://gist.github.com/allenwang28/e3400b9e9212b50aa1cda55ebeccea60 + +logger = logging.getLogger("ray") + + +@dataclass +class _TpuInfo: + """Internal class to hold information about a TPU pod.""" + + name: str + state: str + kind: str + + +# My kingdom for ADTs +@dataclass +class _TpuRunResult: + """Internal class to hold the result of a TPU job.""" + + info: _TpuInfo + + +@dataclass +class TpuSuccess(_TpuRunResult): + result: object + + +@dataclass +class TpuPreempted(_TpuRunResult): + error: Exception + + +@dataclass +class TpuFailed(_TpuRunResult): + error: Exception + + +@dataclass +class TpuRunError(_TpuRunResult): + error: Exception + + +def run_on_pod(remote_fn: RemoteFunction, tpu_type: str): + """ + Run a remote function on a TPU pod. + + Args: + remote_fn: A remote function that takes no arguments + tpu_type: The type of TPU to run on, e.g. 
"v4-32" + """ + + @ray.remote(resources={f"TPU-{tpu_type}-head": 1}) + def do_run(remote_fn) -> _TpuRunResult: + tpu_name = ray.util.accelerators.tpu.get_current_pod_name() # -> my-tpu + num_hosts = ray.util.accelerators.tpu.get_current_pod_worker_count() # -> 4 + remote_fn = remote_fn.options(resources={tpu_name: 1, "TPU": 1}) + + info = _TpuInfo(tpu_name, "ACTIVE", "TPU") + try: + try: + out = ray.get([remote_fn.remote() for _ in range(num_hosts)]) + logger.info("TPU job finished") + return TpuSuccess(info, out) + except RayError as e: + return _handle_ray_error(info, e) + finally: + # remove the tpu lockfile on each host + logger.debug("Removing lockfiles") + _rm_lockfile = ray.remote(resources={tpu_name: 1, "TPU": 1})(_hacky_remove_tpu_lockfile) + try: + ray.get([_rm_lockfile.remote() for _ in range(num_hosts)]) + except Exception: + logger.exception("Failed to remove lockfile") + # swallow the exception + + return do_run.remote(remote_fn) + + +def run_on_pod_resumable(remote_fn, tpu_type, max_retries_preemption=1e6, max_retries_failure=10): + """ + Repeatedly run a function on a TPU pod until it succeeds or a maximum number of retries is reached. + + Args: + remote_fn: A remote function that takes no arguments + tpu_type: The type of TPU to run on, e.g. "v4-32" + max_retries_preemption: The maximum number of times to retry if the job is preempted + max_retries_failure: The maximum number of times to retry if the job fails + """ + num_failures = 0 + num_preemptions = 0 + + while num_failures < max_retries_failure and num_preemptions < max_retries_preemption: + try: + out = ray.get(run_on_pod(remote_fn, tpu_type)) + if isinstance(out, TpuSuccess): + result = out.result + logger.info("Success") + return result + elif isinstance(out, TpuPreempted): + e = out.error + num_preemptions += 1 + print(f"Preempted {num_preemptions} times. {e}") + logger.warning(f"Preempted {num_preemptions} times. {e}", exc_info=e) + elif isinstance(out, TpuFailed): + num_preemptions += 1 + logger.warning(f"TPU node failure. 
Treating as preempted: {num_preemptions} times") + elif isinstance(out, TpuRunError): + e = out.error + num_failures += 1 + logger.warning(f"Failed {num_failures} times") + logger.exception(e) + else: + raise RuntimeError(f"Unexpected result: {out}") + except ray.exceptions.RayTaskError as e: + if "preempted" in str(e): + num_preemptions += 1 + logger.warning(f"Preempted {num_preemptions} times, {e}") + else: + num_failures += 1 + logger.warning(f"Failed {num_failures} times") + except Exception as e: + num_failures += 1 + logger.warning(f"Failed {num_failures} times") + logger.exception(e) + if num_failures >= max_retries_failure: + raise e + + if num_preemptions >= max_retries_preemption: + raise RuntimeError("Preempted too many times") + elif num_failures >= max_retries_failure: + raise RuntimeError("Failed too many times") + + +def _run_command(*args, **kwargs): + return subprocess.check_call(args, **kwargs) + + +def run_docker_on_pod(image_id: str, command: Sequence[str], *, tpu_type: str, env: dict, name="levanter", retries=10): + env = _massage_env(env) + + docker_cmd = make_docker_run_command(image_id, command, env=env, foreground=True, name=name) + + def run_docker(): + _kill_old_container(name) + try: + return _run_command(*docker_cmd) + except subprocess.CalledProcessError as e: + logger.exception("Failed to run docker command") + raise e + + run_on_pod_resumable( + ray.remote(run_docker), tpu_type=tpu_type, max_retries_failure=retries, max_retries_preemption=10000 + ) + + +def _kill_old_container(name): + try: + _run_command("sudo", "docker", "rm", "-f", name) + except subprocess.CalledProcessError: + pass + + +def _handle_ray_error(tpu_info: _TpuInfo, e: RayError): + """ + Handle a Ray error that occurred on a TPU pod. Tries to determine if the error was due to a + node failure or preemption or just an application error. + """ + # treat node failures as preemptions + if isinstance(e, NodeDiedError): + print("Node died") + logger.exception("Node died", exc_info=e) + return TpuPreempted(tpu_info, e) + elif isinstance(e, WorkerCrashedError): + print("Worker crashed") + logger.exception("Worker crashed", exc_info=e) + return TpuPreempted(tpu_info, e) + elif isinstance(e, RaySystemError): + logger.exception("System error", exc_info=e) + return TpuRunError(tpu_info, e) + elif isinstance(e, RayTaskError): + # node preemptions don't always show up as one of the above errors and can just be a RayTaskError. We have + # to try to sniff out the TPU's status. + from levanter.infra.tpus import get_current_tpu_is_preempted + + if get_current_tpu_is_preempted(): + print("Preempted") + logger.exception("Preempted", exc_info=e) + return TpuPreempted(tpu_info, e) + + logger.exception(f"Task error {e}", exc_info=e) + return TpuRunError(tpu_info, e) + + else: + logger.exception("Unknown error", exc_info=e) + return TpuRunError(tpu_info, e) + + +@dataclass +class RunOnPodConfig: + image_id: str + command: list[str] | str + tpu_type: str + env: dict = dataclasses.field(default_factory=dict) + name: str = "levanter" + retries: int = 10 + + +@draccus.wrap() +def main(args: RunOnPodConfig): + """ + Run a command on a TPU pod. This is a wrapper around `run_docker_on_pod` that takes a config object as a CLI. + + We use this via infra/launch_on_ray.py to run docker containers on TPUs. 
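For orientation, a minimal sketch of how the resumable runner introduced in this file might be used for a non-docker workload. The TPU type and the body of `fn` are illustrative placeholders, not part of this patch:

    # Illustrative only: run an arbitrary JAX function on a TPU slice with retries.
    import ray
    from levanter.infra.ray_tpu import run_on_pod_resumable

    @ray.remote
    def fn():
        import jax
        # placeholder workload: just report how many TPU chips this host sees
        return len(jax.devices("tpu"))

    # retries the whole pod on preemption/failure, returns one result per host
    per_host_results = run_on_pod_resumable(fn, tpu_type="v4-32", max_retries_failure=3)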
+ """ + ray.init() + + import shlex + + if isinstance(args.command, str): + command = shlex.split(args.command) + else: + command = args.command + + run_docker_on_pod( + args.image_id, + command, + tpu_type=args.tpu_type, + env=args.env, + name=args.name, + ) + + +def _hacky_remove_tpu_lockfile(): + """ + This is a hack to remove the lockfile that TPU pods create on the host filesystem. + + libtpu only allows one process to access the TPU at a time, and it uses a lockfile to enforce this. + Ordinarily a lockfile would be removed when the process exits, but in the case of Ray, the process is + a long-running daemon that doesn't typically exit until the node is shut down. This means that the lockfile + persists across Ray tasks. This doesn't apply to our docker-based workloads, but it does apply to other + tasks that use JAX directly. + """ + try: + os.unlink("/tmp/libtpu_lockfile") + except FileNotFoundError: + pass + except PermissionError: + logger.warning("Failed to remove lockfile") + try: + os.system("sudo rm /tmp/libtpu_lockfile") + except Exception: # noqa + pass + + +def _massage_env(env): + # Ray pretends it's running in a TTY, which leads to a ton of log spam from tqdm. + # Levanter uses tqdm_loggable, which tries to sniff out the TTY, but it doesn't work with Ray. + # So we force it + if "TERM" not in env: + env = {**env, "TERM": "dumb"} + + return env + + +if __name__ == "__main__": + main() + + # leaving this here for testing purposes + # ray.init() + # tpu_type = "v4-64" + # @ray.remote + # def fn(): + # import jax + # import jax.random as jrandom + # from jax.lax import with_sharding_constraint + # from jax.sharding import PartitionSpec as P, Mesh + # mesh = Mesh(jax.devices("tpu"), ("x",)) + # sharding = jax.sharding.NamedSharding(mesh, P('x')) + # print(jax.devices()) + # + # @jax.jit + # def init(): + # x = jrandom.normal(jrandom.PRNGKey(0), (32,)) + # weights = jrandom.normal(jrandom.PRNGKey(1), (32, 4)) + # bias = jrandom.normal(jrandom.PRNGKey(2), (4,)) + # + # x_sharded = jax.device_put(x, sharding) + # weights_sharded = jax.device_put(weights, sharding) + # return x_sharded, weights_sharded, bias + # + # x, weights, bias = init() + # + # @jax.jit + # def layer(x, weights, bias): + # with mesh: + # return with_sharding_constraint(jax.nn.sigmoid(x @ weights + bias), P()) + # + # out = layer(x, weights, bias) + # + # print(out) + # import numpy + # return numpy.array(out) + # results = ray.get(run_on_pod(fn, tpu_type)) + # print(results) diff --git a/src/levanter/infra/tpus.py b/src/levanter/infra/tpus.py index b8a8df9e0..bbb1cc5f5 100644 --- a/src/levanter/infra/tpus.py +++ b/src/levanter/infra/tpus.py @@ -1,15 +1,21 @@ import concurrent.futures import getpass import json +import logging import os import subprocess import sys import time from typing import Optional +import requests # type: ignore + from levanter.infra.cli_helpers import make_docker_run_command +logger = logging.getLogger(__name__) + + def setup_vm_docker(tpu_name, zone, node_count): """Change docker permissions on `tpu_name`, remove any old runs, and setup the cache volume.""" tpu_ssh( @@ -55,7 +61,7 @@ def list_tpus(zone): ) -def describe_tpu(tpu_name, zone): +def describe_tpu_queued_resource(tpu_name, zone): try: return json.loads( subprocess.check_output( @@ -78,12 +84,35 @@ def describe_tpu(tpu_name, zone): return None -def start_tpu_vm(tpu_name, *, tpu_type, capacity_type, version, zone, node_count): +def describe_tpu_vm(tpu_name, zone): + try: + return json.loads( + subprocess.check_output( + [ + 
"gcloud", + "alpha", + "compute", + "tpus", + "tpu-vm", + "describe", + tpu_name, + f"--zone={zone}", + "--format=json(name.basename(), state)", + "--quiet", + ], + stderr=subprocess.DEVNULL, + ) + ) + except subprocess.CalledProcessError: + return None + + +def start_tpu_vm_queued_resources(tpu_name, *, tpu_type, capacity_type, version, zone, node_count): # ensure alpha is enabled run_command("gcloud", "components", "install", "alpha", "--quiet") if version is None: version = "tpu-ubuntu2204-base" - tpu_stat = describe_tpu(tpu_name, zone) + tpu_stat = describe_tpu_queued_resource(tpu_name, zone) if tpu_stat is not None: if tpu_stat["state"]["state"] in ["FAILED", "SUSPENDED"]: print("TPU suspended, deleting...", file=sys.stderr) @@ -144,7 +173,7 @@ def start_tpu_vm(tpu_name, *, tpu_type, capacity_type, version, zone, node_count time.sleep(60) waited += 1 - tpu_stat = describe_tpu(tpu_name, zone) + tpu_stat = describe_tpu_queued_resource(tpu_name, zone) assert tpu_stat is not None, f"{tpu_name} creation failed." match tpu_stat["state"]["state"]: @@ -170,7 +199,7 @@ def launch_job( foreground: bool, version: Optional[str] = None, ): - start_tpu_vm( + start_tpu_vm_queued_resources( tpu_name=tpu_name, tpu_type=tpu_type, capacity_type=capacity_type, @@ -277,3 +306,49 @@ def _tpu_ssh_multislice(tpu_name, zone, node_count, *args, ignore_failure=False) print("Ignoring failure:", e) else: raise + + +GCE_TPU_ACCELERATOR_ENDPOINT = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/" +GCE_TPU_HEADERS = {"Metadata-Flavor": "Google"} + + +def get_current_tpu_metadata(key: str) -> Optional[str]: + # cribbed from Ray. + """Poll and get TPU metadata. This only works on a **TPU VM**.""" + try: + accelerator_type_request = requests.get( + os.path.join(GCE_TPU_ACCELERATOR_ENDPOINT, key), + headers=GCE_TPU_HEADERS, + ) + if accelerator_type_request.status_code == 200 and accelerator_type_request.text: + return accelerator_type_request.text + else: + logging.debug( + "Unable to poll TPU GCE Metadata. Got " + f"status code: {accelerator_type_request.status_code} and " + f"content: {accelerator_type_request.text}" + ) + except requests.RequestException as e: + logging.debug("Unable to poll the TPU GCE Metadata: %s", e) + return None + + +def get_current_tpu_is_preempted() -> bool: + """curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/preempted""" + try: + preempted_request = requests.get( + "http://metadata.google.internal/computeMetadata/v1/instance/preempted", + headers=GCE_TPU_HEADERS, + ) + if preempted_request.status_code == 200: + return preempted_request.text == "TRUE" + else: + logging.warning( + "Unable to poll TPU preempted status. 
Got " + f"status code: {preempted_request.status_code} and " + f"content: {preempted_request.text}" + ) + return False + except requests.RequestException as e: + logging.debug("Unable to poll TPU preempted status: %s", e) + raise e diff --git a/src/levanter/store/cache.py b/src/levanter/store/cache.py index 6db7693fe..608019374 100644 --- a/src/levanter/store/cache.py +++ b/src/levanter/store/cache.py @@ -364,7 +364,6 @@ def _attempt_to_write_batches(self): def _dequeue_ready_batches(self): for shard, batch in self._batch_queue.drain(): logger.debug(f"Writing batch for {shard}") - batch = _canonicalize_batch(batch) self._total_queue_length -= len(batch) self._ordered_but_unwritten_items.extend(batch) self._batches_in_next_write_by_shard[shard] = self._batches_in_next_write_by_shard.get(shard, 0) + len( From 8ad30747a1f977812bc2083d3aa0d0b150fc197f Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 24 Sep 2024 16:04:49 -0700 Subject: [PATCH 65/94] see if it's this file in particular (#742) --- tests/whisper_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/whisper_test.py b/tests/whisper_test.py index 048f7f124..544ef02bb 100644 --- a/tests/whisper_test.py +++ b/tests/whisper_test.py @@ -4,6 +4,7 @@ import jax import jax.numpy as jnp import numpy as onp +import pytest from datasets import load_dataset from jax.random import PRNGKey from transformers import WhisperConfig as HfWhisperConfig @@ -21,6 +22,7 @@ from test_utils import skip_if_no_soundlibs, skip_if_no_torch +@pytest.mark.skip @skip_if_no_soundlibs def test_whisper_loss(): c = HfWhisperConfig.from_pretrained("openai/whisper-tiny") @@ -50,6 +52,7 @@ def test_whisper_loss(): model.compute_loss(AudioTextExample.init(na, inp, attn_mask=mask)) +@pytest.mark.skip @skip_if_no_soundlibs def test_basic_forward_whisper(): c = HfWhisperConfig.from_pretrained("openai/whisper-tiny") @@ -75,6 +78,7 @@ def test_basic_forward_whisper(): model(na, inp) +@pytest.mark.skip @skip_if_no_soundlibs def test_mask_forward_whisper(): c = HfWhisperConfig.from_pretrained("openai/whisper-tiny") @@ -100,6 +104,7 @@ def test_mask_forward_whisper(): model(na, inp, attn_mask=AttentionMask.causal()) +@pytest.mark.skip @skip_if_no_soundlibs def test_namedarray_mask_forward_whisper(): c = HfWhisperConfig.from_pretrained("openai/whisper-tiny") @@ -125,6 +130,7 @@ def test_namedarray_mask_forward_whisper(): model(na, inp, attn_mask=AttentionMask.causal().explicit_mask) +@pytest.mark.skip @skip_if_no_soundlibs @skip_if_no_torch def test_hf_roundtrip(): From 541ff126de9e1e92ef21155f64ecbbf5021f1480 Mon Sep 17 00:00:00 2001 From: Oleg <142805497+devactivity-team@users.noreply.github.com> Date: Wed, 25 Sep 2024 02:57:41 +0300 Subject: [PATCH 66/94] Update README.md (#656) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 13097d7dd..aa999e0ae 100644 --- a/README.md +++ b/README.md @@ -200,6 +200,8 @@ Please see the [CUDA Getting Started](docs/Getting-Started-GPU.md) guide for mor ## Contributing +[![GitHub repo Good Issues for newbies](https://img.shields.io/github/issues/stanford-crfm/levanter/good%20first%20issue?style=flat&logo=github&logoColor=green&label=Good%20First%20issues)](https://github.com/stanford-crfm/levanter/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) [![GitHub Help Wanted 
issues](https://img.shields.io/github/issues/stanford-crfm/levanter/help%20wanted?style=flat&logo=github&logoColor=b545d1&label=%22Help%20Wanted%22%20issues)](https://github.com/stanford-crfm/levanter/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) [![GitHub Help Wanted PRs](https://img.shields.io/github/issues-pr/stanford-crfm/levanter/help%20wanted?style=flat&logo=github&logoColor=b545d1&label=%22Help%20Wanted%22%20PRs)](https://github.com/stanford-crfm/levanter/pulls?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) [![GitHub repo Issues](https://img.shields.io/github/issues/stanford-crfm/levanter?style=flat&logo=github&logoColor=red&label=Issues)](https://github.com/stanford-crfm/levanter/issues?q=is%3Aopen) + We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for more information. ## License From 91be677d56efbab8babf507e7742bbceeb7ef93a Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 24 Sep 2024 17:14:52 -0700 Subject: [PATCH 67/94] bump levanter version (#743) --- pyproject.toml | 2 +- src/levanter/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index add95b4ff..babf664e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "levanter" -version = "1.1" +version = "1.2" authors = [ { name = "David Hall", email = "dlwh@cs.stanford.edu" }, { name = "Ivan Zhou", email = "ivanz@stanford.edu" }, diff --git a/src/levanter/__init__.py b/src/levanter/__init__.py index 2674d5bd6..b969828bc 100644 --- a/src/levanter/__init__.py +++ b/src/levanter/__init__.py @@ -13,4 +13,4 @@ from levanter.trainer import initialize -__version__ = "1.1" +__version__ = "1.2" From cd82fb3d326bff91e33683f1adc86ca89c4992cf Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 24 Sep 2024 21:54:37 -0700 Subject: [PATCH 68/94] Make new tokenization ~67% faster (#744) --- scripts/launch_gpt2_small_fast_tpu.sh | 2 +- src/levanter/store/cache.py | 45 ++++++++++++++++++++++----- tests/test_new_cache.py | 18 +++++++++++ 3 files changed, 56 insertions(+), 9 deletions(-) diff --git a/scripts/launch_gpt2_small_fast_tpu.sh b/scripts/launch_gpt2_small_fast_tpu.sh index 7b2634749..0c09cdcfa 100644 --- a/scripts/launch_gpt2_small_fast_tpu.sh +++ b/scripts/launch_gpt2_small_fast_tpu.sh @@ -1,6 +1,6 @@ # Launches the "gpt_small_fast" model on a TPU node -python infra/launch.py --foreground --tpu_name levanter-itest-32 --zone us-central2-b --tpu_type v4-32 --preemptible -- \ +python infra/launch.py --foreground --tpu_name $(whoami)-levanter-itest-32 --zone us-central2-b --tpu_type v4-32 --preemptible -- \ python -m levanter.main.train_lm \ --config_path config/gpt2_small_fast.yaml \ --trainer.checkpointer.base_path gs://levanter-checkpoints/gpt-itest/ --trainer.checkpointer.save_interval 30m $* diff --git a/src/levanter/store/cache.py b/src/levanter/store/cache.py index 608019374..56aa54f99 100644 --- a/src/levanter/store/cache.py +++ b/src/levanter/store/cache.py @@ -272,6 +272,11 @@ def __init__( # double check that we're not finished by committing the ledger self._attempt_to_write_batches() + if not self._ledger.is_finished: + self._actual_writer_thread = threading.Thread(target=self._write_loop, daemon=True) + self._stop_loop = threading.Event() + self._actual_writer_thread.start() + def batch_finished(self, shard_name: str, shard_batch_idx: int, batch_result_box): with log_failures_to(self._parent): if self._failed: @@ -286,7 +291,6 @@ def batch_finished(self, 
shard_name: str, shard_batch_idx: int, batch_result_box # we need to keep track of the order of the batches so that we can write them out in order self._total_queue_length += len(batch_result) self._batch_queue.append_to_group(shard_name, shard_batch_idx, batch_result) - self._attempt_to_write_batches() next_missing_item = self._batch_queue.next_missing_item_index() overwhelmed = self.is_overwhelmed() @@ -303,6 +307,7 @@ def batch_finished(self, shard_name: str, shard_batch_idx: int, batch_result_box def shard_failed(self, shard_name: str, batch_id: int, exc_info: ExceptionInfo): with log_failures_to(self._parent): self._failed = True + self._stop_loop.set() logger.error(f"Shard {shard_name} failed at batch {batch_id}", exc_info=exc_info.restore()) self._parent.shard_failed.remote(shard_name, exc_info) @@ -314,7 +319,10 @@ def shard_finished_reading(self, shard_name: str, expected_num_rows: int): logger.debug( f"Attempting to write batches because {shard_name} finished reading with {expected_num_rows} batches." ) - self._attempt_to_write_batches() + self.flush() + + def flush(self): + self._attempt_to_write_batches() def get_shard_status(self, shard_name: str): with log_failures_to(self._parent): @@ -327,7 +335,7 @@ def get_ledger(self): def _attempt_to_write_batches(self): if self._ledger.is_finished: - raise RuntimeError("Trying to write batches after cache is finished") + return if self._failed: logger.warning("Not writing batches because of failure.") @@ -361,6 +369,22 @@ def _attempt_to_write_batches(self): ray.wait(futures_to_await + futures_to_await_shards) + def _finish(self): + self._stop_loop.set() + self._actual_writer_thread.join() + + def _write_loop(self): + while True: + try: + self._stop_loop.wait(1) + if self._stop_loop.is_set(): + break + except TimeoutError: + pass + self._attempt_to_write_batches() + if self._ledger.is_finished: + break + def _dequeue_ready_batches(self): for shard, batch in self._batch_queue.drain(): logger.debug(f"Writing batch for {shard}") @@ -422,6 +446,9 @@ def is_overwhelmed(self) -> bool: max_queue_size = self._min_items_to_write * 3 return self._total_queue_length > max_queue_size + def __del__(self): + self._finish() + def _to_list_of_dicts(batch: dict) -> List[dict]: """ @@ -940,16 +967,16 @@ async def get_batch(self, indices: Sequence[int] | slice): return await self.store.get_batch(indices) - async def _wait_for_len(self, needed_len): + async def _wait_for_len(self, needed_len: int): if self._broker is not None: while needed_len > await self.current_len(): - new_ledger = await self._broker.updated_ledger.remote() + new_ledger: CacheLedger = await self._broker.updated_ledger.remote() if needed_len <= new_ledger.total_num_rows: break if new_ledger.is_finished: - if needed_len >= new_ledger.rows_finished: + if needed_len >= new_ledger.total_num_rows: raise IndexError( f"Index {needed_len} out of bounds for cache of size {new_ledger.total_num_rows}" ) @@ -967,7 +994,9 @@ def _wait_for_len_sync(self, needed_len, timeout: Optional[float] = None): if cur_time > t_max: raise TimeoutError(f"Timed out waiting for cache to reach {needed_len}") try: - new_ledger = ray.get(self._broker.updated_ledger.remote(), timeout=max(t_max - cur_time, 10)) + new_ledger: CacheLedger = ray.get( + self._broker.updated_ledger.remote(), timeout=max(t_max - cur_time, 10) + ) except TimeoutError: continue @@ -975,7 +1004,7 @@ def _wait_for_len_sync(self, needed_len, timeout: Optional[float] = None): break if new_ledger.is_finished: - if needed_len >= 
new_ledger.rows_finished: + if needed_len >= new_ledger.total_num_rows: raise IndexError( f"Index {needed_len} out of bounds for cache of size {new_ledger.total_num_rows}" ) diff --git a/tests/test_new_cache.py b/tests/test_new_cache.py index 3302674de..b6132e548 100644 --- a/tests/test_new_cache.py +++ b/tests/test_new_cache.py @@ -216,6 +216,7 @@ async def test_batch_finished(): batch_result = [np.array([1, 2, 3])] await writer.batch_finished.remote(shard_idx, shard_batch_idx, batch_result) + await writer.flush.remote() shard_status = await writer.get_shard_status.remote("shard1") assert shard_status.num_rows_committed == 1 finally: @@ -307,6 +308,8 @@ async def test_attempt_to_write_batches(): await writer.batch_finished.remote("shard1", 0, shard1_batch) await writer.batch_finished.remote("shard2", 0, shard2_batch) + await writer.flush.remote() + ledger = await writer.get_ledger.remote() assert ledger.is_finished is False assert ledger.total_num_rows == 2 # Assuming each batch has 1 row for simplicity @@ -336,6 +339,7 @@ async def test_finalize_cache(): await writer.shard_finished_reading.remote("shard1", 1) await writer.shard_finished_reading.remote("shard2", 1) await writer.batch_finished.remote("shard2", 0, shard2_batch) + await writer.flush.remote() ledger = await writer.get_ledger.remote() assert ledger.is_finished is False @@ -390,6 +394,7 @@ async def test_out_of_order_batches_same_shard(): await writer.batch_finished.remote("shard1", 1, shard1_batch1) await writer.batch_finished.remote("shard1", 0, shard1_batch0) + await writer.flush.remote() store = TreeStore.open(exemplar, cache_dir, mode="r") assert len(store) == 2 @@ -419,6 +424,7 @@ async def test_out_of_order_batches_different_shards(): await writer.batch_finished.remote("shard1", 1, shard1_batch1) await writer.batch_finished.remote("shard2", 0, shard2_batch0) await writer.batch_finished.remote("shard1", 0, shard1_batch0) + await writer.flush.remote() store = TreeStore.open(exemplar, cache_dir, mode="r") assert len(store) == 3 @@ -451,6 +457,7 @@ async def test_batches_different_orders_all_shards(): await writer.batch_finished.remote("shard3", 0, shard3_batch0) await writer.batch_finished.remote("shard1", 1, shard1_batch1) await writer.batch_finished.remote("shard1", 0, shard1_batch0) + await writer.flush.remote() store = TreeStore.open(exemplar, cache_dir, mode="r") assert len(store) == 4 @@ -486,6 +493,7 @@ async def test_intermixed_batches_same_and_different_shards(): await writer.batch_finished.remote("shard1", 1, shard1_batch1) await writer.batch_finished.remote("shard2", 1, shard2_batch1) await writer.batch_finished.remote("shard1", 0, shard1_batch0) + await writer.flush.remote() store = TreeStore.open(exemplar, cache_dir, mode="r") assert len(store) == 5 @@ -512,6 +520,7 @@ async def test_duplicate_batches_same_shard(): shard1_batch0 = [np.array([1, 2, 3])] await writer.batch_finished.remote("shard1", 0, shard1_batch0) + await writer.flush.remote() with pytest.raises(RayTaskError): await writer.batch_finished.remote("shard1", 0, shard1_batch0) # Duplicate finally: @@ -544,6 +553,7 @@ async def test_mixed_order_batches_multiple_shards(): await writer.batch_finished.remote("shard2", 1, shard2_batch1) await writer.batch_finished.remote("shard1", 0, shard1_batch0) await writer.batch_finished.remote("shard3", 1, shard3_batch1) + await writer.flush.remote() store = TreeStore.open(exemplar, cache_dir, mode="r") assert len(store) == 6 @@ -892,10 +902,12 @@ async def test_backpressure_mechanism(): await 
writer.batch_finished.remote("shard1", 1, shard3_batch) await writer.batch_finished.remote("shard1", 2, shard3_batch) await writer.batch_finished.remote("shard1", 3, shard3_batch) + await writer.flush.remote() # Check if backpressure is signaled is_overwhelmed = await writer.is_overwhelmed.remote() assert is_overwhelmed is True + await writer.flush.remote() for i in range(4): if (await parent.desired_next_item.remote()) == 0: @@ -910,6 +922,12 @@ async def test_backpressure_mechanism(): # Reduce the queue size to relieve backpressure # Check if backpressure is relieved is_overwhelmed = await writer.is_overwhelmed.remote() + count = 0 + while is_overwhelmed and count < 10: + await writer.flush.remote() + await asyncio.sleep(0.4) + is_overwhelmed = await writer.is_overwhelmed.remote() + count += 1 assert is_overwhelmed is False for i in range(4): From 43268e0ed60ee4b17ca91ff5fca1504375d95f4a Mon Sep 17 00:00:00 2001 From: Kamyar Salahi Date: Wed, 25 Sep 2024 16:20:07 -0700 Subject: [PATCH 69/94] Adding supervised data config --- config/gpt2_small_fast_supervised.yaml | 40 +++++++++++ scripts/launch_gpt2_small_fast_tpu.sh | 3 +- src/levanter/data/_preprocessor.py | 4 +- src/levanter/data/sharded_datasource.py | 7 +- src/levanter/data/text.py | 94 ++++++++++++++++++++++++- src/levanter/main/train_lm.py | 20 +++++- 6 files changed, 160 insertions(+), 8 deletions(-) create mode 100644 config/gpt2_small_fast_supervised.yaml diff --git a/config/gpt2_small_fast_supervised.yaml b/config/gpt2_small_fast_supervised.yaml new file mode 100644 index 000000000..0181a3fd4 --- /dev/null +++ b/config/gpt2_small_fast_supervised.yaml @@ -0,0 +1,40 @@ +data: + configs: + owt: + train_urls: + - "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_train.{1..128}-of-128.jsonl.gz" + validation_urls: + - "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_val.{1..8}-of-8.jsonl.gz" + wikitext: + id: dlwh/wikitext_103_detokenized + train_weights: + owt: 0.6 + wikitext: 0.4 + tokenizer: gpt2 + cache_dir: "gs://levanter-data/tokenized/data_mix" +supervised_data: + validation_urls: + - "gs://marin-us-central2/benchmarks/mmlu/mmlu-abstract_algebra-dev-evaluation.jsonl.gz" + cache_dir: "gs://marin-us-central2/benchmarks/tokenized/mmlu/" +model: + type: gpt2 + hidden_dim: 768 + num_heads: 12 + num_layers: 12 + seq_len: 1024 + gradient_checkpointing: true + scale_attn_by_inverse_layer_idx: true +trainer: + tracker: + project: "levanter" + tags: [ "openwebtext+wiki", "gpt2", "itest"] + + mp: p=f32,c=bfloat16 + model_axis_size: 1 + + train_batch_size: 256 + num_train_steps: 20000 +optimizer: + learning_rate: 1E-3 + weight_decay: 0.1 + warmup: 0.01 diff --git a/scripts/launch_gpt2_small_fast_tpu.sh b/scripts/launch_gpt2_small_fast_tpu.sh index 7b2634749..342439041 100644 --- a/scripts/launch_gpt2_small_fast_tpu.sh +++ b/scripts/launch_gpt2_small_fast_tpu.sh @@ -2,5 +2,6 @@ python infra/launch.py --foreground --tpu_name levanter-itest-32 --zone us-central2-b --tpu_type v4-32 --preemptible -- \ python -m levanter.main.train_lm \ - --config_path config/gpt2_small_fast.yaml \ + --config_path config/gpt2_small_fast_supervised.yaml \ --trainer.checkpointer.base_path gs://levanter-checkpoints/gpt-itest/ --trainer.checkpointer.save_interval 30m $* + \ No newline at end of file diff --git a/src/levanter/data/_preprocessor.py b/src/levanter/data/_preprocessor.py index 9ee1e2dc2..284243ec8 100644 --- a/src/levanter/data/_preprocessor.py +++ b/src/levanter/data/_preprocessor.py @@ -79,13 +79,15 @@ class 
_BatchMapTransform(_DatasetTransform): num_cpus: int num_gpus: int resources: dict + output_exemplar: Any - def __init__(self, fn, batch_size, num_cpus, num_gpus, resources): + def __init__(self, fn, batch_size, num_cpus, num_gpus, resources, output_exemplar = None): self.fn = fn self.batch_size = batch_size self.num_cpus = num_cpus self.num_gpus = num_gpus self.resources = resources + self.output_exemplar = output_exemplar def as_record_batch(doc: BatchResult) -> pa.RecordBatch: diff --git a/src/levanter/data/sharded_datasource.py b/src/levanter/data/sharded_datasource.py index 38682616d..74e0c8f3a 100644 --- a/src/levanter/data/sharded_datasource.py +++ b/src/levanter/data/sharded_datasource.py @@ -113,7 +113,7 @@ def map(self, fn: Callable[[T_co], U]) -> "ShardedDataSource[U]": return _MappedShardedDataSource(self, fn) def map_batches( - self, fn: Callable[[list[T_co]], BatchResult], batch_size, *, num_cpus=1, num_gpus=0, **resources + self, fn: Callable[[list[T_co]], BatchResult], batch_size, *, num_cpus=1, num_gpus=0, output_exemplar=None, **resources ) -> "ShardedDataSource[dict]": """ **Lazily** map a function over batches of data. This is useful for doing things like batching data for a model, @@ -131,7 +131,7 @@ def map_batches( Returns: A new ShardedDataset. """ - return _BatchMappedShardedDataSource(self, fn, batch_size, num_cpus=num_cpus, num_gpus=num_gpus, **resources) + return _BatchMappedShardedDataSource(self, fn, batch_size, num_cpus=num_cpus, num_gpus=num_gpus, output_exemplar=output_exemplar, **resources) def datasource_from_hf(id: str, *, split, **kwargs) -> ShardedDataSource[dict]: @@ -478,10 +478,11 @@ def __init__( batch_size, num_cpus=1, num_gpus=0, + output_exemplar=None, **resources, ): self.source = source - self._transform = _BatchMapTransform(fn, batch_size, num_cpus, num_gpus, resources) + self._transform = _BatchMapTransform(fn, batch_size, num_cpus, num_gpus, resources, output_exemplar=output_exemplar) @property def shard_names(self) -> Sequence[str]: diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index 20a11d090..feadd692d 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -583,6 +583,98 @@ def tagged_eval_sets( return [(eval_sets[name], tags[name]) for name in eval_sets] +@dataclass +class LMSupervisedDatasetConfig(LMDatasetSourceConfig): + """This class represents a dataset source with URLs or hf name/id.""" + + cache_dir: str = "cache/" + + tags: Optional[List[str]] = None + """tags for the dataset. 
Typically the name of the dataset in the config will be added as a tag as well""" + name: Optional[str] = None # name for hf dataset + + validation_urls: List[str] = () # type:ignore + + def token_seq_dataset( + self, split: str, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True + ) -> Optional[TokenSeqDataset]: + cache = self.build_or_load_cache(split, monitors=monitors) + if cache is None: + return None + return TokenSeqDataset(cache, seq_len) + + def validation_set( + self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True + ) -> Optional[TokenSeqDataset]: + return self.token_seq_dataset("validation", seq_len, monitors) + + def validation_sets( + self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True + ) -> Mapping[str, AsyncDataset[np.ndarray]]: + validation_set = self.validation_set(seq_len, monitors) + if validation_set is not None: + return {"": validation_set} + else: + return {} + + # Add tagged eval set with split for auxiliary and validation dataset + +def mk_supervised_dataset(config: LMSupervisedDatasetConfig, tokenizer: PreTrainedTokenizerBase): + import levanter.data + dataset = levanter.data.datasource_from_jsonl(config.validation_urls) + + def preprocess(batch): + sources = [example["input"] for example in batch] + targets = [f"{example['output']}{tokenizer.eos_token}" for example in batch] + # TODO: this seems pretty wasteful since you end up tokenizing twice, but it's how the original code does it. + examples = [s + t for s, t in zip(sources, targets)] + sources_tokenized = tokenizer(sources, padding=False, truncation=True) + examples_tokenized = tokenizer(examples, padding=False, truncation=True) + + source_lens = [len(s) for s in sources_tokenized["input_ids"]] + + return { + "input_ids": [np.array(example, dtype=np.int32) for example in examples_tokenized["input_ids"]], + "sources_len": np.array(source_lens, dtype=np.int32), + } + + output_exemplar = { + "input_ids": np.zeros((0,), dtype=np.int32), + "sources_len": np.zeros((), dtype=np.int32) + } + + dataset = dataset.map_batches(preprocess, batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar) # type: ignore + dataset = dataset.build_or_load_cache(config.cache_dir, await_finished=True) # type: ignore + + def _prepare_example(ex: dict) -> LmExample: + """ + Prepare an example for training. This function converts the (cached) batch encoding into an LmExample. + + It goes through the following steps: + + 1. Pad the batch to the maximum length. + 2. Mask out the input and prompt if requested. + 3. Create an LmExample with the input_ids as the input and the next token as the target. 
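For orientation, a toy sketch of the loss-mask construction this function uses below; the axis size is made up, and `hax` is Haliax, which is already a dependency of this repo:

    # Illustrative only: the loss mask used in _prepare_example.
    import haliax as hax
    import jax.numpy as jnp

    Pos = hax.Axis("position", 8)  # toy sequence length
    # 1 - one_hot(-1) is intended to keep every position in the loss except the
    # final one, which has no next token to predict.
    loss_mask = 1 - hax.nn.one_hot(-1, Pos, dtype=jnp.float32)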
+ """ + # annoyingly, pad expects things to be batched so we have to prepend a batch axis + tokenizer.pad_token = tokenizer.eos_token + ex = tokenizer.pad({k: np.expand_dims(v, 0) for k, v in ex.items()}, return_tensors="np", padding="max_length") + ex = {k: v[0] for k, v in ex.items()} + input_ids = hax.named(ex["input_ids"], "position") + # mask out padding and anything before the start of the target + Pos = input_ids.resolve_axis("position") + # if config.mask_inputs: + # loss_mask = hax.arange(Pos) >= ex["sources_len"] + + # # don't predict the padding + # targets = hax.roll(input_ids, -1, Pos) + # loss_mask = loss_mask & (targets != tokenizer.pad_token_id) + # else: + loss_mask = 1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.float32) + lm_ex = LmExample.causal(input_ids, loss_mask=loss_mask) + return lm_ex + + return dataset.map(_prepare_example) @dataclass class LMDatasetConfig(LMDatasetSourceConfig, LMTaskConfig): @@ -828,4 +920,4 @@ def build_caches( @property def sources(self) -> dict[str, LMDatasetSourceConfig]: - return self.configs + return self.configs \ No newline at end of file diff --git a/src/levanter/main/train_lm.py b/src/levanter/main/train_lm.py index 6c96f8b62..3166c91c9 100644 --- a/src/levanter/main/train_lm.py +++ b/src/levanter/main/train_lm.py @@ -16,7 +16,7 @@ from levanter import callbacks from levanter.checkpoint import load_checkpoint from levanter.compat.hf_checkpoints import HFCompatConfig, save_hf_checkpoint_callback -from levanter.data.text import CausalLmDataset, LMDatasetConfig, LMMixtureDatasetConfig +from levanter.data.text import CausalLmDataset, LMDatasetConfig, LMMixtureDatasetConfig, LMSupervisedDatasetConfig from levanter.models.gpt2 import Gpt2Config from levanter.models.lm_model import LmConfig, compute_next_token_loss from levanter.optim import AdamConfig, OptimizerConfig @@ -30,6 +30,7 @@ @dataclass class TrainLmConfig: data: Union[LMDatasetConfig, LMMixtureDatasetConfig] = field(default_factory=LMDatasetConfig) + supervised_data: Optional[LMSupervisedDatasetConfig] = None trainer: TrainerConfig = field(default_factory=TrainerConfig) model: LmConfig = field(default_factory=Gpt2Config) optimizer: OptimizerConfig = field(default_factory=AdamConfig) @@ -170,7 +171,6 @@ def main(config: TrainLmConfig): (CausalLmDataset(ds, Pos, KeyPos, ignore_index=config.data.ignore_token_id), tags) for ds, tags in tagged_eval_datasets ] - cb = levanter.eval.cb_tagged_lm_evaluate( EvalBatch, causal_datasets, @@ -182,6 +182,22 @@ def main(config: TrainLmConfig): ) trainer.add_hook(cb, every=config.trainer.steps_per_eval) + if config.supervised_data is not None: + logger.info("Using supervised data") + supervised_eval = [(levanter.data.text.mk_supervised_dataset(config.supervised_data, tokenizer), "")] + # TODO Add tags + cb = levanter.eval.cb_tagged_lm_evaluate( + EvalBatch, + supervised_eval, + tokenizer, + trainer.device_mesh, + compute_axis_mapping, + max_eval_examples_per_ds, + prefix="internal_eval", + mp=config.trainer.mp, + ) + trainer.add_hook(cb, every=config.trainer.steps_per_eval) + flops_per_token = config.model.flops_per_token(vocab_size) flops_per_example = 3 * flops_per_token * Pos.size if flops_per_token is not None else None trainer.add_hook( From d6ad71fc8f0eb1cd4325122e163820bdc60cf96d Mon Sep 17 00:00:00 2001 From: Kamyar Salahi Date: Wed, 25 Sep 2024 16:24:22 -0700 Subject: [PATCH 70/94] Fixing linter error --- scripts/launch_gpt2_small_fast_tpu.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/launch_gpt2_small_fast_tpu.sh 
b/scripts/launch_gpt2_small_fast_tpu.sh index 342439041..437491e01 100644 --- a/scripts/launch_gpt2_small_fast_tpu.sh +++ b/scripts/launch_gpt2_small_fast_tpu.sh @@ -4,4 +4,3 @@ python infra/launch.py --foreground --tpu_name levanter-itest-32 --zone us-centr python -m levanter.main.train_lm \ --config_path config/gpt2_small_fast_supervised.yaml \ --trainer.checkpointer.base_path gs://levanter-checkpoints/gpt-itest/ --trainer.checkpointer.save_interval 30m $* - \ No newline at end of file From 71bd6964ff4bb18551f0405b0ab74991b0555831 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 26 Sep 2024 15:53:42 -0700 Subject: [PATCH 71/94] Tweaks to Ray TPU stuff (#747) 1. num_tpus=1 is actually a bad idea because Ray will mask out the other tpus 2. force non-docker workloads to run in a separate process for stability --- infra/cluster/job-cluster.yaml | 23 +- infra/launch_on_ray.py | 61 ++--- src/levanter/infra/ray_tpu.py | 291 ++++++++++++++++------ src/levanter/utils/background_iterable.py | 3 +- 4 files changed, 254 insertions(+), 124 deletions(-) diff --git a/infra/cluster/job-cluster.yaml b/infra/cluster/job-cluster.yaml index 652771fcb..cf8703d54 100644 --- a/infra/cluster/job-cluster.yaml +++ b/infra/cluster/job-cluster.yaml @@ -47,10 +47,23 @@ available_node_types: sourceImage: projects/ubuntu-os-cloud/global/images/family/ubuntu-2204-lts # Worker Nodes =>> + tpu_slice_v4_8: + min_workers: 0 + max_workers: 1024 + resources: { "CPU": 120, "TPU": 4 } + + node_config: + acceleratorType: v4-8 + runtimeVersion: tpu-ubuntu2204-base + + # [IMPORTANT] Configure all TPU Workers to be Preemptible! + schedulingConfig: + preemptible: true + tpu_slice_v4_32: min_workers: 0 max_workers: 1024 - resources: { "CPU": 120, "TPU": 1 } + resources: { "CPU": 120, "TPU": 4 } node_config: acceleratorType: v4-32 @@ -63,7 +76,7 @@ available_node_types: tpu_slice_v4_64: min_workers: 0 max_workers: 1024 - resources: {"CPU": 120, "TPU": 1} + resources: {"CPU": 120, "TPU": 4} node_config: acceleratorType: v4-64 @@ -77,7 +90,7 @@ available_node_types: tpu_slice_v4_128: min_workers: 0 max_workers: 1024 - resources: { "CPU": 120, "TPU": 1 } + resources: { "CPU": 120, "TPU": 4 } node_config: acceleratorType: v4-128 @@ -90,7 +103,7 @@ available_node_types: tpu_slice_v4_256: min_workers: 0 max_workers: 1024 - resources: { "CPU": 120, "TPU": 1 } + resources: { "CPU": 120, "TPU": 4 } node_config: acceleratorType: v4-256 @@ -103,7 +116,7 @@ available_node_types: tpu_slice_v4_512: min_workers: 0 max_workers: 1024 - resources: { "CPU": 120, "TPU": 1 } + resources: { "CPU": 120, "TPU": 4 } node_config: acceleratorType: v4-512 diff --git a/infra/launch_on_ray.py b/infra/launch_on_ray.py index 2040aff44..fa5e81f27 100755 --- a/infra/launch_on_ray.py +++ b/infra/launch_on_ray.py @@ -4,16 +4,15 @@ import argparse import getpass import os -import tempfile import time from pathlib import Path -import draccus from ray.dashboard.modules.job.common import JobStatus from ray.dashboard.modules.job.sdk import JobSubmissionClient import levanter.infra.cli_helpers as cli import levanter.infra.docker as docker +from levanter.infra import ray_tpu def main(): @@ -22,7 +21,7 @@ def main(): cli.add_arg(parser, config, ["--docker_base_image"], default="ghcr.io/stanford-crfm/levanter-base:latest") cli.add_arg(parser, config, ["--docker_repository"], default="levanter") - cli.add_arg(parser, config, ["--address"], default="http://127.0.0.1:8265") + cli.add_arg(parser, config, ["--address"], default=None) cli.add_arg(parser, config, ["--image_name"], 
default=f"levanter-{getpass.getuser()}") cli.add_capacity_type_args(parser, config) cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"]) @@ -112,19 +111,11 @@ def main(): env["RUN_ID"] = run_id env["WANDB_DOCKER"] = full_image_id - # run_docker_on_pod( - # full_image_id, - # command=command, - # tpu_type=tpu_type, - # env=env, - # retries=retries, - # ) - # Submit the job to the Ray cluster. We have to use the JobSubmissionClient to do this and stringify the arguments # we want: - from levanter.infra.ray_tpu import RunOnPodConfig + from levanter.infra.ray_tpu import RunDockerOnPodConfig - config = RunOnPodConfig( + config = RunDockerOnPodConfig( image_id=full_image_id, command=command, tpu_type=tpu_type, @@ -133,26 +124,16 @@ def main(): retries=retries, ) - with tempfile.NamedTemporaryFile(suffix=".yaml", prefix=f"launch-{run_id}-", dir=".") as f: - yaml = draccus.dump(config) - f.write(yaml.encode("utf-8")) - f.flush() - - f_name = os.path.relpath(f.name) - print(f"Submitting job with config path {f_name}") - - client = JobSubmissionClient(args.address) + address = args.address or os.getenv("RAY_ADDRESS") - job_id = _make_unique_job_id(client, run_id) - - job_id = client.submit_job( - entrypoint=f"python src/levanter/infra/ray_tpu.py --config_path {f_name}", - runtime_env={"working_dir": "./"}, - job_id=job_id, - ) + job_id = ray_tpu.submit_tpu_job_on_ray( + config, + ray_address=address, + run_id=run_id, + ) - print( - f""" + print( + f""" ------------------------------------------------------- Job '{job_id}' submitted successfully ------------------------------------------------------- @@ -165,9 +146,10 @@ def main(): Request the job to be stopped: ray job stop {job_id} """ - ) + ) if args.foreground: + client = JobSubmissionClient(address) async def tail_job(job_id): async for line in client.tail_job_logs(job_id): # type: ignore @@ -181,7 +163,6 @@ async def tail_job(job_id): wait_until_status( client, job_id, {JobStatus.RUNNING, JobStatus.FAILED, JobStatus.SUCCEEDED, JobStatus.STOPPED} ) - # tail_job(job_id) import asyncio asyncio.run(tail_job(job_id)) @@ -196,19 +177,7 @@ def wait_until_status(client, job_id, status_to_wait_for, timeout_seconds=5): break time.sleep(1) - -# try to make the job id be the same as the run id, but if it already exists, just make it unique -def _make_unique_job_id(client, run_id): - job_id = run_id - try: - while client.get_job_status(job_id) is not None: - job_id = f"{run_id}-{time.time_ns()}" - except Exception as e: # noqa - if "does not exist" in str(e): - pass - else: - raise - return job_id + return status if __name__ == "__main__": diff --git a/src/levanter/infra/ray_tpu.py b/src/levanter/infra/ray_tpu.py index 69f25d02a..3ae5d0105 100644 --- a/src/levanter/infra/ray_tpu.py +++ b/src/levanter/infra/ray_tpu.py @@ -1,16 +1,23 @@ import dataclasses +import functools import logging +import multiprocessing import os import subprocess +import tempfile +import time from dataclasses import dataclass -from typing import Sequence +from typing import Callable, Optional, Sequence import draccus import ray +from ray._private.accelerators import TPUAcceleratorManager +from ray.dashboard.modules.job.sdk import JobSubmissionClient from ray.exceptions import NodeDiedError, RayError, RaySystemError, RayTaskError, WorkerCrashedError from ray.remote_function import RemoteFunction from levanter.infra.cli_helpers import make_docker_run_command +from levanter.utils.ray_utils import ser_exc_info # CF 
https://gist.github.com/allenwang28/e3400b9e9212b50aa1cda55ebeccea60 @@ -55,42 +62,61 @@ class TpuRunError(_TpuRunResult): error: Exception -def run_on_pod(remote_fn: RemoteFunction, tpu_type: str): +def run_on_pod(remote_fn: RemoteFunction | Callable, tpu_type: str) -> ray.ObjectRef: """ Run a remote function on a TPU pod. Args: remote_fn: A remote function that takes no arguments tpu_type: The type of TPU to run on, e.g. "v4-32" + + Returns: + A Ray ObjectRef that represents the result of the function """ @ray.remote(resources={f"TPU-{tpu_type}-head": 1}) def do_run(remote_fn) -> _TpuRunResult: - tpu_name = ray.util.accelerators.tpu.get_current_pod_name() # -> my-tpu num_hosts = ray.util.accelerators.tpu.get_current_pod_worker_count() # -> 4 - remote_fn = remote_fn.options(resources={tpu_name: 1, "TPU": 1}) + remote_fn, tpu_name = _redecorate_remote_fn_for_tpu(remote_fn, num_hosts) info = _TpuInfo(tpu_name, "ACTIVE", "TPU") + futures = [remote_fn.remote() for _ in range(num_hosts)] try: - try: - out = ray.get([remote_fn.remote() for _ in range(num_hosts)]) - logger.info("TPU job finished") - return TpuSuccess(info, out) - except RayError as e: - return _handle_ray_error(info, e) - finally: - # remove the tpu lockfile on each host - logger.debug("Removing lockfiles") - _rm_lockfile = ray.remote(resources={tpu_name: 1, "TPU": 1})(_hacky_remove_tpu_lockfile) - try: - ray.get([_rm_lockfile.remote() for _ in range(num_hosts)]) - except Exception: - logger.exception("Failed to remove lockfile") - # swallow the exception + out = ray.get(futures) + logger.info("TPU job finished") + return TpuSuccess(info, out) + except RayError as e: + for f in futures: + try: + ray.cancel(f) + except Exception: + logger.exception("Failed to kill job after primary failure") + return _handle_ray_error(info, e) return do_run.remote(remote_fn) +def _redecorate_remote_fn_for_tpu(remote_fn, num_hosts): + """ + Redecorate a remote function to run on a TPU pod. + + Specifically, this function: + + * Adds the TPU resources to the function + * forces the function to run in its own process to remove the TPU lockfile (and shutdown jax distributed) + + """ + remote_fn = _forkify_remote_fn(remote_fn) + if not isinstance(remote_fn, RemoteFunction): + remote_fn = ray.remote(remote_fn) + + tpu_name = ray.util.accelerators.tpu.get_current_pod_name() # -> my-tpu + num_tpus_per_host = TPUAcceleratorManager.get_current_node_num_accelerators() # -> 8 + remote_fn = remote_fn.options(resources={tpu_name: 1, "TPU": num_tpus_per_host}) + logger.info(f"Running on TPU {tpu_name} with {num_hosts} hosts and {num_tpus_per_host} TPUs per host") + return remote_fn, tpu_name + + def run_on_pod_resumable(remote_fn, tpu_type, max_retries_preemption=1e6, max_retries_failure=10): """ Repeatedly run a function on a TPU pod until it succeeds or a maximum number of retries is reached. @@ -100,50 +126,63 @@ def run_on_pod_resumable(remote_fn, tpu_type, max_retries_preemption=1e6, max_re tpu_type: The type of TPU to run on, e.g. "v4-32" max_retries_preemption: The maximum number of times to retry if the job is preempted max_retries_failure: The maximum number of times to retry if the job fails + + Returns: + The result of the function (not an ObjectRef) + """ num_failures = 0 num_preemptions = 0 + attempt = 0 + problem: Exception | None = None while num_failures < max_retries_failure and num_preemptions < max_retries_preemption: + logger.info(f"Running on TPU {tpu_type}. 
Attempt {attempt}") + attempt += 1 + problem = None try: out = ray.get(run_on_pod(remote_fn, tpu_type)) - if isinstance(out, TpuSuccess): - result = out.result - logger.info("Success") - return result - elif isinstance(out, TpuPreempted): - e = out.error - num_preemptions += 1 - print(f"Preempted {num_preemptions} times. {e}") - logger.warning(f"Preempted {num_preemptions} times. {e}", exc_info=e) - elif isinstance(out, TpuFailed): - num_preemptions += 1 - logger.warning(f"TPU node failure. Treating as preempted: {num_preemptions} times") - elif isinstance(out, TpuRunError): - e = out.error - num_failures += 1 - logger.warning(f"Failed {num_failures} times") - logger.exception(e) - else: - raise RuntimeError(f"Unexpected result: {out}") except ray.exceptions.RayTaskError as e: + problem = e if "preempted" in str(e): num_preemptions += 1 logger.warning(f"Preempted {num_preemptions} times, {e}") else: num_failures += 1 logger.warning(f"Failed {num_failures} times") + continue except Exception as e: + problem = e num_failures += 1 - logger.warning(f"Failed {num_failures} times") - logger.exception(e) if num_failures >= max_retries_failure: + logger.exception("Failed too many times", exc_info=e) raise e + else: + logger.warning(f"Failed {num_failures} times", exc_info=e) + continue + + if isinstance(out, TpuSuccess): + result = out.result + logger.info("Success") + return result + elif isinstance(out, TpuPreempted): + problem = out.error + num_preemptions += 1 + logger.warning(f"Preempted {num_preemptions} times. {problem}", exc_info=problem) + elif isinstance(out, TpuFailed): + num_preemptions += 1 + logger.warning(f"TPU node failure. Treating as preempted: {num_preemptions} times") + elif isinstance(out, TpuRunError): + problem = out.error + num_failures += 1 + logger.warning(f"Failed {num_failures} times", exc_info=problem) + else: + raise RuntimeError(f"Unexpected result: {out}") - if num_preemptions >= max_retries_preemption: - raise RuntimeError("Preempted too many times") - elif num_failures >= max_retries_failure: - raise RuntimeError("Failed too many times") + if num_preemptions >= max_retries_preemption: + raise RuntimeError("Preempted too many times") from problem + elif num_failures >= max_retries_failure: + raise RuntimeError("Failed too many times") from problem def _run_command(*args, **kwargs): @@ -170,6 +209,7 @@ def run_docker(): def _kill_old_container(name): try: + logger.info(f"Killing old container {name}") _run_command("sudo", "docker", "rm", "-f", name) except subprocess.CalledProcessError: pass @@ -182,11 +222,9 @@ def _handle_ray_error(tpu_info: _TpuInfo, e: RayError): """ # treat node failures as preemptions if isinstance(e, NodeDiedError): - print("Node died") logger.exception("Node died", exc_info=e) return TpuPreempted(tpu_info, e) elif isinstance(e, WorkerCrashedError): - print("Worker crashed") logger.exception("Worker crashed", exc_info=e) return TpuPreempted(tpu_info, e) elif isinstance(e, RaySystemError): @@ -198,7 +236,6 @@ def _handle_ray_error(tpu_info: _TpuInfo, e: RayError): from levanter.infra.tpus import get_current_tpu_is_preempted if get_current_tpu_is_preempted(): - print("Preempted") logger.exception("Preempted", exc_info=e) return TpuPreempted(tpu_info, e) @@ -210,39 +247,70 @@ def _handle_ray_error(tpu_info: _TpuInfo, e: RayError): return TpuRunError(tpu_info, e) -@dataclass -class RunOnPodConfig: - image_id: str - command: list[str] | str - tpu_type: str - env: dict = dataclasses.field(default_factory=dict) - name: str = "levanter" - retries: 
int = 10 +def _forkify_remote_fn(remote_fn: RemoteFunction | Callable): + """ + This is a bit of a hacky way to force a remote function to run in its own process, using multiprocessing. + There are a few issues we're trying to cover: + + * libtpu only allows one process to access the TPU at a time, and it uses a lockfile to enforce this. + * Ray runs tasks in a long-running daemon, so the lockfile persists across tasks. + * jax.distributed likes to only be called once per process, even if you call shutdown -@draccus.wrap() -def main(args: RunOnPodConfig): """ - Run a command on a TPU pod. This is a wrapper around `run_docker_on_pod` that takes a config object as a CLI. + if isinstance(remote_fn, RemoteFunction): + fn = remote_fn._function + + @functools.wraps(fn) + def wrapped_fn(*args, **kwargs): + return _separate_process_fn(fn, args, kwargs) + + # We need these arguments to be able to reconstruct the remote function + # def __init__( + # self, + # language, + # function, + # function_descriptor, + # task_options, + # ): + remote_fn = RemoteFunction( + language=remote_fn._language, + function=wrapped_fn, + function_descriptor=remote_fn._function_descriptor, + task_options=remote_fn._default_options, + ) + return remote_fn + else: + return functools.partial(_separate_process_fn, remote_fn) - We use this via infra/launch_on_ray.py to run docker containers on TPUs. + +def _separate_process_fn(underlying_function, args, kwargs): + """ + Helper function for _forkify_remote_fn. This runs the function in a separate process. """ - ray.init() - import shlex + def target_fn(queue, args, kwargs): + try: + # Call the original function + result = underlying_function(*args, **kwargs) + queue.put((True, result)) # Success, put the result + except Exception as e: + # Capture and return the full traceback in case of an exception + info = ser_exc_info(e) + queue.put((False, info)) - if isinstance(args.command, str): - command = shlex.split(args.command) - else: - command = args.command + queue = multiprocessing.Queue() + process = multiprocessing.Process(target=target_fn, args=(queue, args, kwargs)) + process.start() + process.join() - run_docker_on_pod( - args.image_id, - command, - tpu_type=args.tpu_type, - env=args.env, - name=args.name, - ) + # Retrieve the result or error from the queue + success, value = queue.get() + + if success: + return value + else: + value.reraise() def _hacky_remove_tpu_lockfile(): @@ -267,6 +335,85 @@ def _hacky_remove_tpu_lockfile(): pass +@dataclass +class RunDockerOnPodConfig: + image_id: str + command: list[str] | str + tpu_type: str + env: dict = dataclasses.field(default_factory=dict) + name: str = "levanter" + retries: int = 10 + + +def submit_tpu_job_on_ray(config: RunDockerOnPodConfig, ray_address: str, run_id: Optional[str] = None): + """ + Submit a job to run on a TPU pod on a Ray cluster. This programmatically submits a job to the Ray cluster. + This should be run on your local machine, not on the Ray cluster itself. + + If run_id is not provided, a default run ID will be generated. 
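For orientation, a sketch of how this helper might be called from a local machine; the image id, command, Ray address, and run id below are placeholders, not values taken from this patch:

    # Hypothetical usage; all concrete values here are made up.
    from levanter.infra.ray_tpu import RunDockerOnPodConfig, submit_tpu_job_on_ray

    config = RunDockerOnPodConfig(
        image_id="us-central2-docker.pkg.dev/my-project/levanter/levanter:latest",  # placeholder
        command=["python", "-m", "levanter.main.train_lm", "--config_path", "config/gpt2_small_fast.yaml"],
        tpu_type="v4-32",
        env={"WANDB_PROJECT": "levanter"},
    )

    job_id = submit_tpu_job_on_ray(config, ray_address="http://127.0.0.1:8265", run_id="my-run-id")
    print(f"Submitted Ray job {job_id}")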
+ """ + + with tempfile.NamedTemporaryFile(suffix=".yaml", prefix=f"launch-{run_id}-", dir=".") as f: + yaml = draccus.dump(config) + f.write(yaml.encode("utf-8")) + f.flush() + + f_name = os.path.relpath(f.name) + logger.info(f"Submitting job with config path {f_name}") + + client = JobSubmissionClient(ray_address) + + job_id = _make_unique_job_id(client, run_id) if run_id is not None else None + + job_id = client.submit_job( + entrypoint=f"python -m levanter.infra.ray_tpu --config_path {f_name}", + runtime_env={"working_dir": ".", "env_vars": {"PYTHONPATH": "src:."}}, + submission_id=job_id, + ) + + return job_id + + +# try to make the job id be the same as the run id, but if it already exists, just make it unique +def _make_unique_job_id(client, run_id): + job_id = run_id + try: + while client.get_job_status(job_id) is not None: + job_id = f"{run_id}-{time.time_ns()}" + except Exception as e: # noqa + if "does not exist" in str(e): + pass + else: + raise + return job_id + + +@draccus.wrap() +def main(args: RunDockerOnPodConfig): + """ + *This command is designed to run on a Ray cluster, not on your local machine. You probably want submit_tpu_job_on_ray.* + + Run a command on a TPU pod. This is a wrapper around `run_docker_on_pod` that takes a config object as a CLI. + + We use this via infra/launch_on_ray.py to run docker containers on TPUs. + """ + + import shlex + + if isinstance(args.command, str): + command = shlex.split(args.command) + else: + command = args.command + + run_docker_on_pod( + args.image_id, + command, + tpu_type=args.tpu_type, + env=args.env, + name=args.name, + ) + + def _massage_env(env): # Ray pretends it's running in a TTY, which leads to a ton of log spam from tqdm. # Levanter uses tqdm_loggable, which tries to sniff out the TTY, but it doesn't work with Ray. 
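For orientation, here is a minimal sketch of how the submission path added above is meant to be used from a local machine: build a RunDockerOnPodConfig and hand it to submit_tpu_job_on_ray, which dumps the config to a temp YAML and submits a Ray job whose entrypoint is this module. The image id, command, TPU type, cluster address, and run id below are placeholders, not values from this patch.

from levanter.infra.ray_tpu import RunDockerOnPodConfig, submit_tpu_job_on_ray

# Describe the docker container and TPU slice to run on (fields as defined above).
config = RunDockerOnPodConfig(
    image_id="us-docker.pkg.dev/my-project/levanter/train:latest",  # placeholder image
    command=["python", "-m", "levanter.main.train_lm", "--config_path", "config/llama_7b_with_dclm.yaml"],
    tpu_type="v4-128",
    env={"WANDB_PROJECT": "marin"},
    name="levanter",
)

# Submit to the Ray cluster's job server; returns the Ray job submission id.
job_id = submit_tpu_job_on_ray(config, ray_address="http://127.0.0.1:8265", run_id="llama-7b-dclm")
print(job_id)
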
diff --git a/src/levanter/utils/background_iterable.py b/src/levanter/utils/background_iterable.py index 4318b3f9b..11a80f8ec 100644 --- a/src/levanter/utils/background_iterable.py +++ b/src/levanter/utils/background_iterable.py @@ -82,7 +82,8 @@ def __del__(self): def stop(self, wait: bool = True): self._stop_event.set() - if self.thread is not None and wait: + # I'm getting an error that the thread is threading.current_thread(), which seems impossible + if self.thread is not None and wait and self.thread != threading.current_thread(): self.thread.join() def _fill_queue_with_batches(self): From f5b32cd3eed90227eda40de8ef4f53dbe6785b66 Mon Sep 17 00:00:00 2001 From: Kamyar Salahi Date: Thu, 26 Sep 2024 23:23:05 -0700 Subject: [PATCH 72/94] Fixing supervised training --- config/gpt2_small_fast_supervised.yaml | 4 +- src/levanter/data/text.py | 112 +++++++++++++------------ tests/test_supervised.py | 26 ++++++ 3 files changed, 86 insertions(+), 56 deletions(-) create mode 100644 tests/test_supervised.py diff --git a/config/gpt2_small_fast_supervised.yaml b/config/gpt2_small_fast_supervised.yaml index 0181a3fd4..56ce7ea36 100644 --- a/config/gpt2_small_fast_supervised.yaml +++ b/config/gpt2_small_fast_supervised.yaml @@ -14,8 +14,8 @@ data: cache_dir: "gs://levanter-data/tokenized/data_mix" supervised_data: validation_urls: - - "gs://marin-us-central2/benchmarks/mmlu/mmlu-abstract_algebra-dev-evaluation.jsonl.gz" - cache_dir: "gs://marin-us-central2/benchmarks/tokenized/mmlu/" + - "gs://marin-us-central2/benchmarks/mmlu/mmlu-*-dev-evaluation.jsonl.gz" + cache_dir: "gs://marin-us-central2/benchmarks/tokenized-gpt2/mmlu/" model: type: gpt2 hidden_dim: 768 diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index feadd692d..5a3dbce57 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -389,6 +389,20 @@ def num_gpus(self) -> int: def batch_size(self) -> int: return self._batch_size +def fsspec_expand_glob(url): + expanded_urls = braceexpand.braceexpand(url) + for expanded_url in expanded_urls: + if "*" in expanded_url: + fs = fsspec.core.url_to_fs(expanded_url)[0] + globbed = fs.glob(expanded_url) + # have to append the fs prefix back on + protocol, _ = fsspec.core.split_protocol(expanded_url) + if protocol is None: + yield from globbed + else: + yield from [f"{protocol}://{path}" for path in globbed] + else: + yield expanded_url def concatenate_and_group_texts( encoding: BatchEncoding, @@ -520,19 +534,7 @@ def urls_for_split(self, split): else: raise ValueError(f"Unknown split {split}") - def fsspec_expand_glob(url): - if "*" in url: - fs = fsspec.core.url_to_fs(url)[0] - globbed = fs.glob(url) - # have to append the fs prefix back on - protocol, _ = fsspec.core.split_protocol(url) - if protocol is None: - return globbed - return [f"{protocol}://{path}" for path in globbed] - else: - return [url] - - urls = [globbed for pat in urls for url in braceexpand.braceexpand(pat) for globbed in fsspec_expand_glob(url)] + urls = [globbed for url in urls for globbed in fsspec_expand_glob(url)] return urls @@ -619,62 +621,64 @@ def validation_sets( # Add tagged eval set with split for auxiliary and validation dataset -def mk_supervised_dataset(config: LMSupervisedDatasetConfig, tokenizer: PreTrainedTokenizerBase): - import levanter.data - dataset = levanter.data.datasource_from_jsonl(config.validation_urls) - - def preprocess(batch): - sources = [example["input"] for example in batch] - targets = [f"{example['output']}{tokenizer.eos_token}" for example in batch] - # 
TODO: this seems pretty wasteful since you end up tokenizing twice, but it's how the original code does it. - examples = [s + t for s, t in zip(sources, targets)] - sources_tokenized = tokenizer(sources, padding=False, truncation=True) - examples_tokenized = tokenizer(examples, padding=False, truncation=True) +def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase): + sources = [example["input"] for example in batch] + + targets = [f"{example['output']}" for example in batch] + # TODO: this seems pretty wasteful since you end up tokenizing twice, but it's how the original code does it. + examples = [s + t for s, t in zip(sources, targets)] + sources_tokenized = tokenizer(sources, padding=False, truncation=True) + examples_tokenized = tokenizer(examples, padding=False, truncation=True) - source_lens = [len(s) for s in sources_tokenized["input_ids"]] + source_lens = [len(s) for s in sources_tokenized["input_ids"]] - return { - "input_ids": [np.array(example, dtype=np.int32) for example in examples_tokenized["input_ids"]], - "sources_len": np.array(source_lens, dtype=np.int32), - } - - output_exemplar = { - "input_ids": np.zeros((0,), dtype=np.int32), - "sources_len": np.zeros((), dtype=np.int32) + return { + "input_ids": [np.array(example, dtype=np.int32) for example in examples_tokenized["input_ids"]], + "sources_len": np.array(source_lens, dtype=np.int32), } - dataset = dataset.map_batches(preprocess, batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar) # type: ignore - dataset = dataset.build_or_load_cache(config.cache_dir, await_finished=True) # type: ignore - - def _prepare_example(ex: dict) -> LmExample: - """ - Prepare an example for training. This function converts the (cached) batch encoding into an LmExample. +def _prepare_supervised_example(ex: dict, tokenizer: PreTrainedTokenizerBase) -> LmExample: + """ + Prepare an example for training. This function converts the (cached) batch encoding into an LmExample. - It goes through the following steps: + It goes through the following steps: - 1. Pad the batch to the maximum length. - 2. Mask out the input and prompt if requested. - 3. Create an LmExample with the input_ids as the input and the next token as the target. - """ + 1. Pad the batch to the maximum length. + 2. Mask out the input and prompt if requested. + 3. Create an LmExample with the input_ids as the input and the next token as the target. 
+ """ + with local_cpu_mesh(): # annoyingly, pad expects things to be batched so we have to prepend a batch axis - tokenizer.pad_token = tokenizer.eos_token ex = tokenizer.pad({k: np.expand_dims(v, 0) for k, v in ex.items()}, return_tensors="np", padding="max_length") ex = {k: v[0] for k, v in ex.items()} input_ids = hax.named(ex["input_ids"], "position") # mask out padding and anything before the start of the target Pos = input_ids.resolve_axis("position") - # if config.mask_inputs: - # loss_mask = hax.arange(Pos) >= ex["sources_len"] - - # # don't predict the padding - # targets = hax.roll(input_ids, -1, Pos) - # loss_mask = loss_mask & (targets != tokenizer.pad_token_id) - # else: - loss_mask = 1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.float32) + loss_mask = hax.arange(Pos) >= ex["sources_len"] - 1 + + # don't predict the padding + targets = hax.roll(input_ids, -1, Pos) + loss_mask = loss_mask & (targets != tokenizer.pad_token_id) + loss_mask = loss_mask & (1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.bool_)) lm_ex = LmExample.causal(input_ids, loss_mask=loss_mask) return lm_ex - return dataset.map(_prepare_example) +def mk_supervised_dataset(config: LMSupervisedDatasetConfig, tokenizer: PreTrainedTokenizerBase): + import levanter.data + validation_urls = [url for url_pat in config.validation_urls for url in fsspec_expand_glob(url_pat)] + dataset = levanter.data.datasource_from_jsonl(validation_urls) + + output_exemplar = { + "input_ids": np.zeros((0,), dtype=np.int32), + "sources_len": np.zeros((), dtype=np.int32) + } + + dataset = dataset.map_batches(lambda ex: preprocess_supervised_example(ex, tokenizer), batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar) # type: ignore + dataset = dataset.build_or_load_cache(config.cache_dir, await_finished=True) # type: ignore + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + return dataset.map(lambda ex: _prepare_supervised_example(ex, tokenizer)) @dataclass class LMDatasetConfig(LMDatasetSourceConfig, LMTaskConfig): diff --git a/tests/test_supervised.py b/tests/test_supervised.py new file mode 100644 index 000000000..48856d585 --- /dev/null +++ b/tests/test_supervised.py @@ -0,0 +1,26 @@ +from levanter.data.text import preprocess_supervised_example, _prepare_supervised_example +from transformers import AutoTokenizer +import numpy as np +import haliax + +def test_supervised_eval(): + examples = [{"input": "Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.\nA. 0\nB. 1\nC. 2\nD. 
3\nAnswer:", "output": "B"}] + tokenizer = AutoTokenizer.from_pretrained("gpt2") + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + output = preprocess_supervised_example(examples, tokenizer) + assert len(output["input_ids"][0]) == output["sources_len"][0] + 1 + + ex = {'input_ids': np.array([16742, 477, 269, 287, 1168, 62, 18, 884, 326, + 1168, 62, 18, 58, 87, 60, 29006, 87, 61, + 17, 1343, 269, 8, 318, 257, 2214, 13, 198, + 32, 13, 657, 198, 33, 13, 352, 198, 34, + 13, 362, 198, 35, 13, 513, 198, 33706, 25, + 33], dtype=np.int32), 'sources_len': np.array(45, dtype=np.int32)} + + lm_ex = _prepare_supervised_example(ex, tokenizer) + + assert(lm_ex.loss_mask['position', 44] != False) + assert(haliax.sum(lm_ex.loss_mask) == 1) \ No newline at end of file From 6483b42a8b0acba0c62ca80d8e16eced3aae4b62 Mon Sep 17 00:00:00 2001 From: Kamyar Salahi Date: Thu, 26 Sep 2024 23:25:38 -0700 Subject: [PATCH 73/94] Making linter happy --- src/levanter/data/text.py | 4 ++-- tests/test_supervised.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index 5a3dbce57..c89604488 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -623,7 +623,7 @@ def validation_sets( def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase): sources = [example["input"] for example in batch] - + targets = [f"{example['output']}" for example in batch] # TODO: this seems pretty wasteful since you end up tokenizing twice, but it's how the original code does it. examples = [s + t for s, t in zip(sources, targets)] @@ -655,7 +655,7 @@ def _prepare_supervised_example(ex: dict, tokenizer: PreTrainedTokenizerBase) -> # mask out padding and anything before the start of the target Pos = input_ids.resolve_axis("position") loss_mask = hax.arange(Pos) >= ex["sources_len"] - 1 - + # don't predict the padding targets = hax.roll(input_ids, -1, Pos) loss_mask = loss_mask & (targets != tokenizer.pad_token_id) diff --git a/tests/test_supervised.py b/tests/test_supervised.py index 48856d585..49e38b4c4 100644 --- a/tests/test_supervised.py +++ b/tests/test_supervised.py @@ -6,21 +6,21 @@ def test_supervised_eval(): examples = [{"input": "Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.\nA. 0\nB. 1\nC. 2\nD. 
3\nAnswer:", "output": "B"}] tokenizer = AutoTokenizer.from_pretrained("gpt2") - + if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - + output = preprocess_supervised_example(examples, tokenizer) assert len(output["input_ids"][0]) == output["sources_len"][0] + 1 - + ex = {'input_ids': np.array([16742, 477, 269, 287, 1168, 62, 18, 884, 326, 1168, 62, 18, 58, 87, 60, 29006, 87, 61, 17, 1343, 269, 8, 318, 257, 2214, 13, 198, 32, 13, 657, 198, 33, 13, 352, 198, 34, 13, 362, 198, 35, 13, 513, 198, 33706, 25, 33], dtype=np.int32), 'sources_len': np.array(45, dtype=np.int32)} - + lm_ex = _prepare_supervised_example(ex, tokenizer) - + assert(lm_ex.loss_mask['position', 44] != False) assert(haliax.sum(lm_ex.loss_mask) == 1) \ No newline at end of file From 45d41d8d54712ede785dbecef77f48d4e7221e7f Mon Sep 17 00:00:00 2001 From: Kamyar Salahi Date: Thu, 26 Sep 2024 23:47:24 -0700 Subject: [PATCH 74/94] Making linter happy --- src/levanter/data/_preprocessor.py | 2 +- src/levanter/data/sharded_datasource.py | 17 +++++- src/levanter/data/text.py | 57 +++++++++--------- tests/test_supervised.py | 78 +++++++++++++++++++++---- 4 files changed, 113 insertions(+), 41 deletions(-) diff --git a/src/levanter/data/_preprocessor.py b/src/levanter/data/_preprocessor.py index 284243ec8..170796fb6 100644 --- a/src/levanter/data/_preprocessor.py +++ b/src/levanter/data/_preprocessor.py @@ -81,7 +81,7 @@ class _BatchMapTransform(_DatasetTransform): resources: dict output_exemplar: Any - def __init__(self, fn, batch_size, num_cpus, num_gpus, resources, output_exemplar = None): + def __init__(self, fn, batch_size, num_cpus, num_gpus, resources, output_exemplar=None): self.fn = fn self.batch_size = batch_size self.num_cpus = num_cpus diff --git a/src/levanter/data/sharded_datasource.py b/src/levanter/data/sharded_datasource.py index 74e0c8f3a..6ebb15cc3 100644 --- a/src/levanter/data/sharded_datasource.py +++ b/src/levanter/data/sharded_datasource.py @@ -113,7 +113,14 @@ def map(self, fn: Callable[[T_co], U]) -> "ShardedDataSource[U]": return _MappedShardedDataSource(self, fn) def map_batches( - self, fn: Callable[[list[T_co]], BatchResult], batch_size, *, num_cpus=1, num_gpus=0, output_exemplar=None, **resources + self, + fn: Callable[[list[T_co]], BatchResult], + batch_size, + *, + num_cpus=1, + num_gpus=0, + output_exemplar=None, + **resources, ) -> "ShardedDataSource[dict]": """ **Lazily** map a function over batches of data. This is useful for doing things like batching data for a model, @@ -131,7 +138,9 @@ def map_batches( Returns: A new ShardedDataset. 
""" - return _BatchMappedShardedDataSource(self, fn, batch_size, num_cpus=num_cpus, num_gpus=num_gpus, output_exemplar=output_exemplar, **resources) + return _BatchMappedShardedDataSource( + self, fn, batch_size, num_cpus=num_cpus, num_gpus=num_gpus, output_exemplar=output_exemplar, **resources + ) def datasource_from_hf(id: str, *, split, **kwargs) -> ShardedDataSource[dict]: @@ -482,7 +491,9 @@ def __init__( **resources, ): self.source = source - self._transform = _BatchMapTransform(fn, batch_size, num_cpus, num_gpus, resources, output_exemplar=output_exemplar) + self._transform = _BatchMapTransform( + fn, batch_size, num_cpus, num_gpus, resources, output_exemplar=output_exemplar + ) @property def shard_names(self) -> Sequence[str]: diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index c89604488..664f067dd 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -389,6 +389,7 @@ def num_gpus(self) -> int: def batch_size(self) -> int: return self._batch_size + def fsspec_expand_glob(url): expanded_urls = braceexpand.braceexpand(url) for expanded_url in expanded_urls: @@ -404,6 +405,7 @@ def fsspec_expand_glob(url): else: yield expanded_url + def concatenate_and_group_texts( encoding: BatchEncoding, seq_len: int, @@ -585,6 +587,7 @@ def tagged_eval_sets( return [(eval_sets[name], tags[name]) for name in eval_sets] + @dataclass class LMSupervisedDatasetConfig(LMDatasetSourceConfig): """This class represents a dataset source with URLs or hf name/id.""" @@ -597,30 +600,31 @@ class LMSupervisedDatasetConfig(LMDatasetSourceConfig): validation_urls: List[str] = () # type:ignore - def token_seq_dataset( - self, split: str, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> Optional[TokenSeqDataset]: - cache = self.build_or_load_cache(split, monitors=monitors) - if cache is None: - return None - return TokenSeqDataset(cache, seq_len) - - def validation_set( - self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> Optional[TokenSeqDataset]: - return self.token_seq_dataset("validation", seq_len, monitors) - - def validation_sets( - self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> Mapping[str, AsyncDataset[np.ndarray]]: - validation_set = self.validation_set(seq_len, monitors) - if validation_set is not None: - return {"": validation_set} - else: - return {} + # def token_seq_dataset( + # self, split: str, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True + # ) -> Optional[TokenSeqDataset]: + # cache = self.build_or_load_cache(split, monitors=monitors) + # if cache is None: + # return None + # return TokenSeqDataset(cache, seq_len) + + # def validation_set( + # self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True + # ) -> Optional[TokenSeqDataset]: + # return self.token_seq_dataset("validation", seq_len, monitors) + + # def validation_sets( + # self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True + # ) -> Mapping[str, AsyncDataset[np.ndarray]]: + # validation_set = self.validation_set(seq_len, monitors) + # if validation_set is not None: + # return {"": validation_set} + # else: + # return {} # Add tagged eval set with split for auxiliary and validation dataset + def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase): sources = [example["input"] for example in batch] @@ -637,6 +641,7 @@ def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase): "sources_len": np.array(source_lens, dtype=np.int32), } + 
def _prepare_supervised_example(ex: dict, tokenizer: PreTrainedTokenizerBase) -> LmExample: """ Prepare an example for training. This function converts the (cached) batch encoding into an LmExample. @@ -663,15 +668,14 @@ def _prepare_supervised_example(ex: dict, tokenizer: PreTrainedTokenizerBase) -> lm_ex = LmExample.causal(input_ids, loss_mask=loss_mask) return lm_ex + def mk_supervised_dataset(config: LMSupervisedDatasetConfig, tokenizer: PreTrainedTokenizerBase): import levanter.data + validation_urls = [url for url_pat in config.validation_urls for url in fsspec_expand_glob(url_pat)] dataset = levanter.data.datasource_from_jsonl(validation_urls) - output_exemplar = { - "input_ids": np.zeros((0,), dtype=np.int32), - "sources_len": np.zeros((), dtype=np.int32) - } + output_exemplar = {"input_ids": np.zeros((0,), dtype=np.int32), "sources_len": np.zeros((), dtype=np.int32)} dataset = dataset.map_batches(lambda ex: preprocess_supervised_example(ex, tokenizer), batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar) # type: ignore dataset = dataset.build_or_load_cache(config.cache_dir, await_finished=True) # type: ignore @@ -680,6 +684,7 @@ def mk_supervised_dataset(config: LMSupervisedDatasetConfig, tokenizer: PreTrain return dataset.map(lambda ex: _prepare_supervised_example(ex, tokenizer)) + @dataclass class LMDatasetConfig(LMDatasetSourceConfig, LMTaskConfig): """This class supports loading data both from HF Datasets and from a raw dataset of jsonl urls""" @@ -924,4 +929,4 @@ def build_caches( @property def sources(self) -> dict[str, LMDatasetSourceConfig]: - return self.configs \ No newline at end of file + return self.configs diff --git a/tests/test_supervised.py b/tests/test_supervised.py index 49e38b4c4..e1d9098d2 100644 --- a/tests/test_supervised.py +++ b/tests/test_supervised.py @@ -1,10 +1,18 @@ -from levanter.data.text import preprocess_supervised_example, _prepare_supervised_example -from transformers import AutoTokenizer import numpy as np +from transformers import AutoTokenizer + import haliax +from levanter.data.text import _prepare_supervised_example, preprocess_supervised_example + + def test_supervised_eval(): - examples = [{"input": "Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.\nA. 0\nB. 1\nC. 2\nD. 3\nAnswer:", "output": "B"}] + examples = [ + { + "input": "Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.\nA. 0\nB. 1\nC. 2\nD. 
3\nAnswer:", + "output": "B", + } + ] tokenizer = AutoTokenizer.from_pretrained("gpt2") if tokenizer.pad_token is None: @@ -13,14 +21,62 @@ def test_supervised_eval(): output = preprocess_supervised_example(examples, tokenizer) assert len(output["input_ids"][0]) == output["sources_len"][0] + 1 - ex = {'input_ids': np.array([16742, 477, 269, 287, 1168, 62, 18, 884, 326, - 1168, 62, 18, 58, 87, 60, 29006, 87, 61, - 17, 1343, 269, 8, 318, 257, 2214, 13, 198, - 32, 13, 657, 198, 33, 13, 352, 198, 34, - 13, 362, 198, 35, 13, 513, 198, 33706, 25, - 33], dtype=np.int32), 'sources_len': np.array(45, dtype=np.int32)} + ex = { + "input_ids": np.array( + [ + 16742, + 477, + 269, + 287, + 1168, + 62, + 18, + 884, + 326, + 1168, + 62, + 18, + 58, + 87, + 60, + 29006, + 87, + 61, + 17, + 1343, + 269, + 8, + 318, + 257, + 2214, + 13, + 198, + 32, + 13, + 657, + 198, + 33, + 13, + 352, + 198, + 34, + 13, + 362, + 198, + 35, + 13, + 513, + 198, + 33706, + 25, + 33, + ], + dtype=np.int32, + ), + "sources_len": np.array(45, dtype=np.int32), + } lm_ex = _prepare_supervised_example(ex, tokenizer) - assert(lm_ex.loss_mask['position', 44] != False) - assert(haliax.sum(lm_ex.loss_mask) == 1) \ No newline at end of file + assert lm_ex.loss_mask["position", 44] + assert haliax.sum(lm_ex.loss_mask) == 1 From b41838f35b6ecda8cb9dbdd3c408e14bcb75b0ad Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 3 Oct 2024 22:57:36 -0700 Subject: [PATCH 75/94] Simplify tokenization pipeline, make it work with large numbers of shards again, (re)add configuration metadata to cache (#752) Co-authored-by: Ahmed Ahmed --- .dockerignore | 1 + config/data/dclm_gpt_neo.yaml | 78 + config/data/dolma_olmo_paloma.yaml | 44 +- config/llama_7b_with_dclm.yaml | 33 + pyproject.toml | 5 +- src/levanter/data/_preprocessor.py | 16 +- src/levanter/data/_queue.py | 248 ---- src/levanter/data/audio.py | 41 +- src/levanter/data/text.py | 58 +- src/levanter/main/train_asr.py | 2 +- src/levanter/store/_prefetch_actor.py | 156 ++ src/levanter/store/cache.py | 1944 ++++++++++++------------- src/levanter/store/jagged_array.py | 68 +- src/levanter/store/tree_store.py | 18 +- src/levanter/utils/py_utils.py | 35 + src/levanter/utils/ray_utils.py | 42 +- tests/test_audio.py | 10 + tests/test_jagged_array.py | 48 +- tests/test_new_cache.py | 619 ++------ tests/test_prefetch_actor.py | 137 ++ tests/test_tree_store.py | 15 +- tests/test_utils.py | 6 +- 22 files changed, 1762 insertions(+), 1862 deletions(-) create mode 100644 config/data/dclm_gpt_neo.yaml create mode 100644 config/llama_7b_with_dclm.yaml delete mode 100644 src/levanter/data/_queue.py create mode 100644 src/levanter/store/_prefetch_actor.py create mode 100644 tests/test_prefetch_actor.py diff --git a/.dockerignore b/.dockerignore index 45dfa95e6..9abaa045d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -117,3 +117,4 @@ dmypy.json # local execution commands local_*.sh +.aider* diff --git a/config/data/dclm_gpt_neo.yaml b/config/data/dclm_gpt_neo.yaml new file mode 100644 index 000000000..fd70a5d52 --- /dev/null +++ b/config/data/dclm_gpt_neo.yaml @@ -0,0 +1,78 @@ +cache_dir: "gs://marin-us-central2/tokenized/gpt_neox/" +tokenizer: "EleutherAI/gpt-neox-20b" +cache_options: + batch_size: 256 + num_shard_groups: 1024 +stop_strategy: restart +shuffle: 100000 +configs: + "dclm": + train_urls: + - gs://marin-us-central2/raw/dclm/v2024-07-09-baseline-dedup/**/*.zstd + # these are just for eval + "paloma/4chan": + validation_urls: + - gs://levanter-data/paloma/4chan_meta_sep/val/val*.jsonl.gz + 
"paloma/c4_100_domains": + validation_urls: + - gs://levanter-data/paloma/c4_100_domains/val/val*.jsonl.gz + "paloma/c4_en": + validation_urls: + - gs://levanter-data/paloma/c4_en/val/val*.jsonl.gz + "paloma/dolma-v1_5": + validation_urls: + - gs://levanter-data/paloma/dolma-v1_5/val/val*.jsonl.gz + "paloma/dolma_100_programing_languages": + validation_urls: + - gs://levanter-data/paloma/dolma_100_programing_languages/val/val*.jsonl.gz + "paloma/dolma_100_subreddits": + validation_urls: + - gs://levanter-data/paloma/dolma_100_subreddits/val/val*.jsonl.gz + "paloma/falcon-refinedweb": + validation_urls: + - gs://levanter-data/paloma/falcon-refinedweb/val/val*.jsonl.gz + "paloma/gab": + validation_urls: + - gs://levanter-data/paloma/gab/val/val*.jsonl.gz + "paloma/m2d2_s2orc_unsplit": + validation_urls: + - gs://levanter-data/paloma/m2d2_s2orc_unsplit/val/val*.jsonl.gz + "paloma/m2d2_wikipedia_unsplit": + validation_urls: + - gs://levanter-data/paloma/m2d2_wikipedia_unsplit/val/val*.jsonl.gz + "paloma/manosphere_meta_sep": + validation_urls: + - gs://levanter-data/paloma/manosphere_meta_sep/val/val*.jsonl.gz + "paloma/mc4": + validation_urls: + - gs://levanter-data/paloma/mc4/val/val*.jsonl.gz + "paloma/ptb": + validation_urls: + - gs://levanter-data/paloma/ptb/val/val*.jsonl.gz + "paloma/redpajama": + validation_urls: + - gs://levanter-data/paloma/redpajama/val/val*.jsonl.gz + "paloma/twitterAAE_HELM_fixed": + validation_urls: + - gs://levanter-data/paloma/twitterAAE_HELM_fixed/val/val*.jsonl.gz + "paloma/wikitext_103": + validation_urls: + - gs://levanter-data/paloma/wikitext_103/val/val*.jsonl.gz +train_weights: + dclm: 1.0 + paloma/4chan: 0.0 + paloma/c4_100_domains: 0.0 + paloma/c4_en: 0.0 + paloma/dolma-v1_5: 0.0 + paloma/dolma_100_programing_languages: 0.0 + paloma/dolma_100_subreddits: 0.0 + paloma/falcon-refinedweb: 0.0 + paloma/gab: 0.0 + paloma/m2d2_s2orc_unsplit: 0.0 + paloma/m2d2_wikipedia_unsplit: 0.0 + paloma/manosphere_meta_sep: 0.0 + paloma/mc4: 0.0 + paloma/ptb: 0.0 + paloma/redpajama: 0.0 + paloma/twitterAAE_HELM_fixed: 0.0 + paloma/wikitext_103: 0.0 diff --git a/config/data/dolma_olmo_paloma.yaml b/config/data/dolma_olmo_paloma.yaml index 54cbcd05f..6aefbdd47 100644 --- a/config/data/dolma_olmo_paloma.yaml +++ b/config/data/dolma_olmo_paloma.yaml @@ -1,59 +1,59 @@ -cache_dir: "gs://marin-data/tokenized/OLMo-1B/dolma-v1.7" +cache_dir: "gs://marin-us-central2/tokenized/OLMo-1B/dolma/v1.7" tokenizer: "allenai/OLMo-1B" # requires `pip install ai2-olmo` # tokenizer: "meta-llama/Llama-2-7b-hf" stop_strategy: restart configs: dolma-algebraic-stack: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/algebraic-stack-train-{0000..0015}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/algebraic-stack-train-{0000..0015}.json.gz dolma-arxiv: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/arxiv-{0000..0099}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/arxiv-{0000..0099}.json.gz dolma-gutenberg: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/books-{0000..0002}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/books-{0000..0002}.json.gz dolma-c4: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/c4-{0000..0170}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/c4-{0000..0170}.json.gz dolma-cc: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/cc_en_head-{0000..0274}.json.gz - - gs://marin-data/raw/dolma/dolma-v1.7/cc_en_middle-{0000..0238}.json.gz # 239 is missing - - gs://marin-data/raw/dolma/dolma-v1.7/cc_en_middle-{0240..0379}.json.gz - - 
gs://marin-data/raw/dolma/dolma-v1.7/cc_en_tail-{0000..0152}.json.gz # 153 is missing - - gs://marin-data/raw/dolma/dolma-v1.7/cc_en_tail-{0154..0444}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/cc_en_head-{0000..0274}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/cc_en_middle-{0000..0238}.json.gz # 239 is missing + - gs://marin-us-central2/raw/dolma/v1.7/cc_en_middle-{0240..0379}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/cc_en_tail-{0000..0152}.json.gz # 153 is missing + - gs://marin-us-central2/raw/dolma/v1.7/cc_en_tail-{0154..0444}.json.gz dolma-cc-news: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/cc_news_head-{0000..0004}.json.gz - - gs://marin-data/raw/dolma/dolma-v1.7/cc_news_middle-{0000..0002}.json.gz - - gs://marin-data/raw/dolma/dolma-v1.7/cc_news_tail-0000.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/cc_news_head-{0000..0004}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/cc_news_middle-{0000..0002}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/cc_news_tail-0000.json.gz dolma-falcon: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/falcon-{0000..0499}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/falcon-{0000..0499}.json.gz dolma-megawika: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/megawika-{0000..0261}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/megawika-{0000..0261}.json.gz dolma-owmath: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/open-web-math-train-{0000..0012}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/open-web-math-train-{0000..0012}.json.gz dolma-pes2o: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/pes2o-{0000..0025}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/pes2o-{0000..0025}.json.gz dolma-reddit: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/reddit-{0000..0077}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/reddit-{0000..0077}.json.gz dolma-stackexchange: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/stackexchange-{0000..0025}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/stackexchange-{0000..0025}.json.gz dolma-starcoder: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/starcoder-{0000..0048}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/starcoder-{0000..0048}.json.gz dolma-flan: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/tulu_flan-{0000..0065}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/tulu_flan-{0000..0065}.json.gz dolma-wiki: train_urls: - - gs://marin-data/raw/dolma/dolma-v1.7/wiki-{0000..0001}.json.gz + - gs://marin-us-central2/raw/dolma/v1.7/wiki-{0000..0001}.json.gz # these are just for eval "paloma/4chan": validation_urls: diff --git a/config/llama_7b_with_dclm.yaml b/config/llama_7b_with_dclm.yaml new file mode 100644 index 000000000..980e64e41 --- /dev/null +++ b/config/llama_7b_with_dclm.yaml @@ -0,0 +1,33 @@ +data: !include data/dclm_gpt_neo.yaml +model: # 7B class model + type: llama + seq_len: 2048 + hidden_dim: 4096 + intermediate_dim: 11008 + num_layers: 32 + num_heads: 32 + num_kv_heads: 32 + use_flash_attention: True +trainer: + tracker: + type: wandb + entity: "stanford-mercury" + project: "marin" + tags: ["dclm", "7B", "llama"] + + mp: p=f32,c=bfloat16 + train_batch_size: 2048 + num_train_steps: 70000 # 280B / 4M + steps_per_eval: 1000 + tensor_parallel_axes: ["mlp", "heads"] + fsdp_axis: "embed" + batch_axis: "batch" +optimizer: + learning_rate: 4e-4 + weight_decay: 0.1 + min_lr_ratio: 0.1 + beta1: 0.9 + beta2: 0.95 + warmup: 5000 + +z_loss_weight: 5e-6 diff --git a/pyproject.toml b/pyproject.toml index 
babf664e9..b0c3df90a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "braceexpand>=0.1.7", "jmp>=0.0.3", "fsspec[http]>=2024.2,<2024.10", - "tensorstore==0.1.63", + "tensorstore>=0.1.65", "pytimeparse>=1.1.8", "humanfriendly==10.0", "safetensors[numpy]~=0.4.2", @@ -50,7 +50,8 @@ dependencies = [ "filelock~=3.13", # "ai2-olmo", "async-lru~=2.0", - "tqdm-loggable>=0.2" + "tqdm-loggable>=0.2", + "deepdiff" ] [project.urls] diff --git a/src/levanter/data/_preprocessor.py b/src/levanter/data/_preprocessor.py index 9ee1e2dc2..09efb364d 100644 --- a/src/levanter/data/_preprocessor.py +++ b/src/levanter/data/_preprocessor.py @@ -27,10 +27,8 @@ class BatchProcessor(Generic[T_contra, U], ABC): @abstractmethod def __call__(self, batch: Sequence[T_contra]) -> Sequence[U] | U: # U can be batched "structure of arrays" form """ - Process a batch of data. You should return either a RecordBatch, a sequence of dicts (one per output + Process a batch of data. You should return a sequence of dicts (one per output example), or a dict of sequences (one per output field). - - (We allow Mapping so that you can just return HF's BatchEncoding if you want.) """ raise NotImplementedError @@ -58,8 +56,10 @@ def num_gpus(self) -> int: return 0 @property - def batch_size(self) -> int: - return 128 + @abstractmethod + def metadata(self) -> Dict[str, Any]: + """Any metadata that changes the behavior of this processor.""" + raise NotImplementedError class _DatasetTransform(ABC): @@ -150,7 +150,7 @@ def rec(dataset): class _CompositeBatchProcessor(BatchProcessor): - def __init__(self, transforms, batch_size, num_cpus, num_gpus, resources): + def __init__(self, transforms, num_cpus, num_gpus, resources): self.transforms = transforms self._num_cpus = num_cpus self._num_gpus = num_gpus @@ -207,6 +207,10 @@ def __call__(self, batch): return batch + @property + def metadata(self): + return {} + def dict_from_record_batch(b) -> dict: # we follow the convention from hf batchencoding where homogeneous-lengthed arrays are turned into nd arrays diff --git a/src/levanter/data/_queue.py b/src/levanter/data/_queue.py deleted file mode 100644 index fd8f84860..000000000 --- a/src/levanter/data/_queue.py +++ /dev/null @@ -1,248 +0,0 @@ -import asyncio -import dataclasses -import heapq -import logging as pylogging -import threading -import time -from dataclasses import dataclass -from queue import PriorityQueue -from typing import List, Optional, Protocol, Sequence, TypeVar - -import ray -from ray.actor import ActorHandle - -from levanter.utils.ray_utils import RefBox - -from ._preprocessor import BatchProcessor - - -logger = pylogging.getLogger(__name__) - -T = TypeVar("T") -U = TypeVar("U") -LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - - -class PriorityWorkTaskGroupSpec(Protocol): - name: str - - def build(self) -> "PriorityWorkTaskGroup": - raise NotImplementedError() - - -class PriorityWorkTaskGroup(Protocol): - name: str - spec: PriorityWorkTaskGroupSpec - - def items(self) -> Sequence["PriorityWorkItem"]: - raise NotImplementedError() - - -class PriorityWorkItem(Protocol): - name: str - priority: float - spec: PriorityWorkTaskGroupSpec - - def execute(self) -> tuple[bool, Optional[ray.ObjectRef]]: - """ - Returns true if the item is finished, false if it should be rescheduled. - The object ref is used (1) to block shutting down the actor too early - and (2) for backpressure. 
- """ - raise NotImplementedError() - - # needs to be sortable by priority - def __lt__(self, other: "PriorityWorkItem"): - if self.priority == other.priority: - return self.name < other.name - else: - return self.priority < other.priority - - def __le__(self, other: "PriorityWorkItem"): - if self.priority == other.priority: - return self.name <= other.name - else: - return self.priority <= other.priority - - -def _mk_queue_aware_process_task(processor: BatchProcessor[T, U], queue: ActorHandle): - @ray.remote(num_cpus=processor.num_cpus, num_gpus=processor.num_gpus, resources=processor.resources) - def process_task(desc, batch: List[T]): - # pylogging.basicConfig(level=DEFAULT_LOG_LEVEL, format=LOG_FORMAT) - logger.debug(f"Processing batch {desc}") - queue.task_running.remote() - try: - result = processor(batch) - logger.debug(f"Finished processing batch {desc}") - return result - except Exception as e: - logger.exception(f"Error while processing batch {desc}") - raise e - finally: - pass - - return process_task - - -@dataclass(order=True, frozen=True) -class _QueueItem: - priority: float - desc: str - batch: ray.ObjectRef = dataclasses.field(compare=False) - task_id: int - task_future: asyncio.Future = dataclasses.field(compare=False) - - -@ray.remote(num_cpus=0) -class _BatchProcessorQueue: # (Generic[T]): ray doesn't like generics - """ - A queue of tasks to be processed by a BatchProcessor. - - BatchProcessorQueue spins up tasks to process batches of data. - It spins up tasks until it reaches the maximum number of tasks that can be run in parallel. - It then waits for a task to finish before spinning up another one. - """ - - pqueue: PriorityQueue[_QueueItem] - processor: BatchProcessor - _next_task_id: int - ready: bool # whether or not we can spin up a new task - - @property - def batch_size(self): - return self.processor.batch_size - - def __init__(self, batch_processor: BatchProcessor[T, U]): - self.pqueue = PriorityQueue() - self.processor = batch_processor - self._next_task_id = 0 - self.ready = True # whether we're ready to ask ray to start a new task - self_ref = ray.runtime_context.get_runtime_context().current_actor - self._task_processor = _mk_queue_aware_process_task(batch_processor, self_ref) - - # we don't need/want to dereference the batch, so we wrap it in a RefBox - # one virtue of doing things this way is that we can let Ray try to schedule the compute near the data. - async def submit(self, priority: float, desc: str, batch: RefBox): - """Returns a future that is set to the *ObjectRef* of the processed batch. The future is "complete" when the task - starts, not when it finishes. 
You then call ray.get on the future's result to get the actual batch.""" - task_id = self._next_task_id - self._next_task_id += 1 - f: asyncio.Future = asyncio.Future() - self.pqueue.put(_QueueItem(priority, desc, batch.ref, task_id, f)) - self._maybe_start_task() - return await f - - def _maybe_start_task(self): - if self.ready and not self.pqueue.empty(): - self.ready = False - item = self.pqueue.get() - batch = item.batch - try: - item.task_future.set_result(self._task_processor.remote(item.desc, batch)) - except Exception as e: - item.task_future.set_exception(e) - - def task_running(self): - self.ready = True - self._maybe_start_task() - - -@ray.remote(num_cpus=0.5, scheduling_strategy="SPREAD") -class WorkQueueDispatcherActor: - def __init__(self, max_in_flight: Optional[int] = 200): - pylogging.basicConfig(level=pylogging.INFO, format=LOG_FORMAT) - self._queue: list[PriorityWorkItem] = [] # heapq - self._queue_lock = threading.Lock() - self._shutdown_event = threading.Event() - self._current_item: Optional[PriorityWorkItem] = None - self._max_in_flight = max_in_flight - - self._max_priority: Optional[float] = None - self._processing_thread = threading.Thread(target=self._loop, daemon=True) - self._processing_thread.start() - - def set_max_dispatch_priority(self, max_priority: Optional[float]): - """ - When the sink is full, we will not dispatch items with a priority higher than this. - """ - with self._queue_lock: - self._max_priority = max_priority - - def assign_work(self, group: PriorityWorkTaskGroupSpec): - items = group.build().items() - with self._queue_lock: - for item in items: - heapq.heappush(self._queue, item) - - def is_group_finished(self, group: PriorityWorkTaskGroupSpec): - with self._queue_lock: - if any(item.spec == group for item in self._queue): - return False - - if self._current_item is not None and self._current_item.spec == group: - return False - - logger.debug(f"Group {group.name} is finished.") - - return True - - def cancel_work_group(self, group: PriorityWorkTaskGroupSpec): - # kill all the items in the group - with self._queue_lock: - self._queue = [item for item in self._queue if item.spec != group] - heapq.heapify(self._queue) - - def shutdown(self): - if not self._shutdown_event.is_set(): - self._shutdown_event.set() - - if self._processing_thread.is_alive(): - self._processing_thread.join() - - def _loop(self: "WorkQueueDispatcherActor"): - should_sleep = False - backpressure_queue: list[ray.ObjectRef] = [] - - def drain_backpressure_to(count): - nonlocal backpressure_queue - while len(backpressure_queue) > count: - finished, remaining = ray.wait(backpressure_queue, num_returns=1, fetch_local=False) - backpressure_queue = remaining - - while not self._shutdown_event.is_set(): - if should_sleep: - time.sleep(0.1) - - drain_backpressure_to(self._max_in_flight) - - with self._queue_lock: - if len(self._queue) == 0: - should_sleep = True - continue - else: - should_sleep = False - - item = heapq.heappop(self._queue) - if self._max_priority is not None and item.priority > self._max_priority: - logger.debug(f"Item {item.name} has priority {item.priority} which is too high. Rescheduling.") - heapq.heappush(self._queue, item) - continue - self._current_item = item - - try: - item_is_finished, ref = item.execute() - if ref is not None: - backpressure_queue.append(ref) - except Exception: - logger.exception(f"Error while processing {item.name}. 
Killing all associated work.") - self.cancel_work_group(item.spec) - continue - - with self._queue_lock: - self._current_item = None - if not item_is_finished: - heapq.heappush(self._queue, item) - - logger.debug("Shutting down PriorityProcessorActor. Waiting for backpressure to drain.") - drain_backpressure_to(0) - logger.debug("Backpressure drained. Shutting down PriorityProcessorActor.") diff --git a/src/levanter/data/audio.py b/src/levanter/data/audio.py index d04479a24..f8a193f04 100644 --- a/src/levanter/data/audio.py +++ b/src/levanter/data/audio.py @@ -4,7 +4,7 @@ import os from dataclasses import dataclass from functools import cached_property -from typing import Iterator, List, Mapping, Optional, Sequence, Tuple, Union +from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union import braceexpand import datasets @@ -29,7 +29,7 @@ # intercept the logging nonsense here from levanter.logging import silence_transformer_nag from levanter.models.asr_model import AudioTextExample -from levanter.store.cache import TreeCache, build_or_load_cache +from levanter.store.cache import CacheOptions, TreeCache, build_or_load_cache from levanter.utils.jax_utils import local_cpu_mesh @@ -73,7 +73,6 @@ def __init__( enforce_bos=True, enforce_eos=True, *, - batch_size=128, override_resources=None, max_length=448, padding=True, @@ -83,7 +82,6 @@ def __init__( tokenizer, enforce_bos=enforce_bos, enforce_eos=enforce_eos, - batch_size=batch_size, override_resources=override_resources, return_attention_mask=True, padding="max_length" if padding else False, @@ -91,7 +89,6 @@ def __init__( ) self.override_resources = override_resources - self._batch_size = batch_size def __call__(self, batch: Sequence[Tuple[np.ndarray, int, str]]) -> Sequence[AudioTextDict]: """ @@ -123,6 +120,13 @@ def __call__(self, batch: Sequence[Tuple[np.ndarray, int, str]]) -> Sequence[Aud return out # type: ignore + @property + def metadata(self) -> Dict[str, Any]: + return { + "tokenizer": self.bt.metadata, + "processor": self.feature_extractor.to_dict(), + } + @property def output_exemplar(self): return AudioTextDict_exemplar @@ -136,10 +140,6 @@ def num_cpus(self) -> int: def num_gpus(self) -> int: return self.bt.num_gpus - @property - def batch_size(self) -> int: - return self.bt._batch_size - @dataclass class AudioDatasetSourceConfig: @@ -247,8 +247,10 @@ def the_feature_extractor(self) -> SequenceFeatureExtractor: @abc.abstractmethod def train_set( - self, batch_size: int, monitors: Union[bool, List[MetricsMonitor]] = True - ) -> AsyncDataset[np.ndarray]: + self, + monitors: Union[bool, List[MetricsMonitor]] = True, + options: CacheOptions = CacheOptions.default(), + ) -> AsyncDataset[AudioTextDict]: pass @abc.abstractmethod @@ -294,18 +296,17 @@ def build_or_load( tokenizer: PreTrainedTokenizerBase, enforce_bos=True, enforce_eos=True, - batch_size=128, monitors=None, await_finished=True, override_resources=None, max_length=448, + cache_options: CacheOptions = CacheOptions.default(), ) -> "ProcessedAudioCache": bp = BatchAudioProcessor( processor, tokenizer, enforce_bos=enforce_bos, enforce_eos=enforce_eos, - batch_size=batch_size, override_resources=override_resources, max_length=max_length, ) @@ -316,6 +317,7 @@ def build_or_load( bp, await_finished=await_finished, monitors=monitors, + options=cache_options, ) if cache.is_finished: logger.info(f"Cache {cache_dir} is complete.") @@ -339,7 +341,8 @@ def load(cache_dir): """ try: - cache = TreeCache.load(cache_dir, AudioTextDict_exemplar) + # TODO: 
populate cache config + cache = TreeCache.load(cache_dir, AudioTextDict_exemplar, options=None) return ProcessedAudioCache(cache) except FileNotFoundError: raise FileNotFoundError(f"{cache_dir} is not a complete cache") @@ -352,8 +355,10 @@ def load(cache_dir): class AudioIODatasetConfig(AudioDatasetSourceConfig, AudioTaskConfig): """This class supports loading data both from HF Datasets and from a raw dataset of jsonl urls""" - def train_set(self, batch_size: int, monitors: Union[bool, List[MetricsMonitor]] = True) -> ProcessedAudioCache: - ds = self.build_or_load_cache(self.train_split, batch_size=batch_size, monitors=monitors) + def train_set( + self, monitors: Union[bool, List[MetricsMonitor]] = True, options: CacheOptions = CacheOptions.default() + ) -> ProcessedAudioCache: + ds = self.build_or_load_cache(self.train_split, monitors=monitors) if ds is None: raise ValueError("No training set!") return ds @@ -388,9 +393,9 @@ def _has_validation_set(self): def build_or_load_cache( self, split: str, - batch_size: int = 128, monitors: Union[bool, List[MetricsMonitor]] = True, logger_name: Optional[str] = None, + cache_options: CacheOptions = CacheOptions.default(), ) -> Optional[ProcessedAudioCache]: split_cache_dir = os.path.join(self.cache_dir, split) name = logger_name or os.path.basename(self.cache_dir) @@ -422,10 +427,10 @@ def build_or_load_cache( self.the_tokenizer, enforce_bos=self.enforce_bos, enforce_eos=self.enforce_eos, - batch_size=batch_size, monitors=monitors, await_finished=(split == "validation"), max_length=self.max_length, + cache_options=cache_options, ) diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index 20a11d090..62dfb62ba 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -8,7 +8,7 @@ from dataclasses import dataclass from functools import cached_property from itertools import chain -from typing import Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, TypeVar, Union +from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, TypeVar, Union import braceexpand import datasets @@ -34,7 +34,7 @@ from levanter.logging import silence_transformer_nag # noqa from levanter.models.attention import AttentionMask from levanter.models.lm_model import LmExample -from levanter.store.cache import TreeCache +from levanter.store.cache import CacheOptions, TreeCache from levanter.store.jagged_array import JaggedArrayStore from levanter.store.tree_store import TreeStore from levanter.utils.hf_utils import num_cpus_used_by_tokenizer @@ -114,23 +114,6 @@ async def get_batch(self, indices: Sequence[int]) -> Sequence[T_co]: out = await asyncio.gather(*out) return out - def get_batch_sync(self, indices: Sequence[int]) -> Sequence[T_co]: - token_arrays = self.doc_cache.store.tree["input_ids"] - # logger.info(f"Time to get token cache: {time.time() - time_in}") - # len = await self.wait_until_len_at_least(max(indices) + 1) - # if len is not None and len < max(indices) + 1: - # raise ValueError("Requested indices beyond the end of the dataset") - offsets = np.array(indices) * self.seq_len - with ts.Batch(): - out = [] - for offset in offsets: - out.append(token_arrays.data[offset : offset + self.seq_len].read()) - # logger.info(f"Time to read token cache: {time.time() - time_in}") - - out = [x.result() for x in out] - # logger.info(f"Time to wait for token cache: {time.time() - time_in}") - return out - async def wait_until_len_at_least(self, length: int) -> int: # length is brutally slow to compute, so we cache it if 
self._cached_len is not None and self._cached_len >= length: @@ -213,7 +196,6 @@ def __init__( enforce_bos=True, enforce_eos=True, *, - batch_size=128, override_resources=None, _workaround_len=LONG_STRING_WORKAROUND, return_attention_mask=False, @@ -247,8 +229,6 @@ def __init__( should_append_eos = False should_append_bos = False - self._batch_size = batch_size - self._need_to_add_eos = should_append_eos self._need_to_add_bos = should_append_bos self._workaround_len = _workaround_len @@ -306,6 +286,18 @@ def _break_for_long_sequences(self, batch): batch.append(d) return batch, needs_merge + @property + def metadata(self) -> Dict[str, Any]: + return { + "tokenizer": self.tokenizer.name_or_path, + "vocab_size": len(self.tokenizer), + "return_attention_mask": self.return_attention_mask, + "padding": self.padding, + "max_length": self.max_length, + "append_bos": self._need_to_add_bos, + "append_eos": self._need_to_add_eos, + } + @property def output_exemplar(self) -> dict: return dict(**self.tokenizer("hi there", return_attention_mask=self.return_attention_mask, verbose=False)) @@ -385,10 +377,6 @@ def num_gpus(self) -> int: return self.override_resources.get("num_gpus", 0) return 0 - @property - def batch_size(self) -> int: - return self._batch_size - def concatenate_and_group_texts( encoding: BatchEncoding, @@ -543,7 +531,7 @@ class LMTaskConfig(abc.ABC): # config related to caching cache_dir: str = "cache/" - tokenizer_batch_size: int = 32 + cache_options: CacheOptions = field(default_factory=CacheOptions) enforce_eos: bool = True # whether to append eos even if the tokenizer doesn't ignore_token_id: Optional[int] = None @@ -650,6 +638,7 @@ def build_or_load_cache( name = logger_name or os.path.basename(self.cache_dir) try: + # TODO: pass in options return TreeCache.load(split_cache_dir, exemplar={"input_ids": np.zeros(0, dtype=np.int32)}) except FileNotFoundError: pass @@ -669,20 +658,15 @@ def build_or_load_cache( elif monitors is False: monitors = [] - bt = BatchTokenizer( - self.the_tokenizer, enforce_bos=True, enforce_eos=self.enforce_eos, batch_size=self.tokenizer_batch_size - ) + bt = BatchTokenizer(self.the_tokenizer, enforce_bos=True, enforce_eos=self.enforce_eos) return build_or_load_cache( split_cache_dir, source, bt, - await_finished=False, monitors=monitors, - cache_config={ - "tokenizer": self.the_tokenizer.name_or_path, - "vocab_size": self.the_tokenizer.vocab_size, - }, + await_finished=False, + options=self.cache_options, ) @@ -820,10 +804,12 @@ def build_caches( # in practice it works best if we block on validation caches if split == "validation": - logger.info("Waiting for validation caches to finish building...") for cache in caches.values(): cache.await_finished() + else: + logger.info(f"Not waiting for {split} caches to finish building") + return caches @property diff --git a/src/levanter/main/train_asr.py b/src/levanter/main/train_asr.py index 72e6d5adb..681a806a6 100644 --- a/src/levanter/main/train_asr.py +++ b/src/levanter/main/train_asr.py @@ -115,7 +115,7 @@ def compute_loss( eval_datasets = config.data.validation_sets() train_dataset = AudioTextDataset( - config.data.train_set(config.batch_size), + config.data.train_set(), Pos, [config.model.Mels, config.model.MelPos], KeyPos, diff --git a/src/levanter/store/_prefetch_actor.py b/src/levanter/store/_prefetch_actor.py new file mode 100644 index 000000000..6b3c302c2 --- /dev/null +++ b/src/levanter/store/_prefetch_actor.py @@ -0,0 +1,156 @@ +import asyncio +import logging +from dataclasses import dataclass +from 
queue import Empty as QueueEmpty +from typing import Callable, Generic, Iterator, List, Optional, TypeVar + +import ray + +from levanter.utils.ray_utils import ExceptionInfo, ser_exc_info + + +T = TypeVar("T") + +logger = logging.getLogger(__name__) + + +@dataclass +class _PrefetchException: + info: ExceptionInfo + + +class _Sentinel: + pass + + +_SENTINEL = _Sentinel() + + +class RayPrefetchQueue(Generic[T]): + def __init__( + self, producer: Callable[[], Iterator[T]], max_queue_size: int = 100, producer_options: dict | None = None + ): + self.max_queue_size = max_queue_size + if producer_options is None: + producer_options = {} + self.queue_actor = _QueueActor.remote(max_queue_size) # type: ignore + self.producer_task = _run_producer.options(**producer_options).remote(self.queue_actor, producer) + self._stopped = False + self._finished = False + + def queue_size(self): + return ray.get(self.queue_actor.qsize.remote()) + + def __next__(self): + return self.get_next() + + def __iter__(self): + return self + + def get_next(self, timeout: float | None = None) -> T: + """ + Get the next item from the producer. If the producer raises an exception, it will be reraised here. + + If the producer is done, this will raise StopIteration. + + Args: + timeout (float|None): Timeout in seconds for getting the next item. If None, will block indefinitely. + + Raises: + Empty: If the queue is empty and the timeout is reached. + """ + if self._finished: + raise StopIteration + # time_in = time.time() + item = ray.get(self.queue_actor.get_next.remote(timeout)) + # time_out = time.time() + # if time_out - time_in > 0.1: + # current_name = ray.get_runtime_context().get_actor_name() + # print(f"{current_name} :: Queue get took {time_out - time_in} seconds :: {self.queue_size()}") + # logger.info(f"{current_name} :: Queue get took {time_out - time_in} seconds :: {self.queue_size()}") + if isinstance(item, _PrefetchException): + item.info.reraise() + if isinstance(item, _Sentinel): + self._finished = True + raise StopIteration + return item + + def stop(self): + ray.cancel(self.producer_task) + ray.get(self.queue_actor.stop.remote()) + self._stopped = True + + def is_stopped(self): + return self._stopped + + def drain_available(self, max_size: int) -> List[T]: + return ray.get(self.queue_actor.drain_available.remote(max_size)) + + +@ray.remote +class _QueueActor: + def __init__(self, max_queue_size: int): + self.queue: asyncio.Queue = asyncio.Queue(maxsize=max_queue_size) + self._stopped = False + self._finished = False + + async def put(self, item): + await self.queue.put(item) + + async def get_next(self, timeout: Optional[float] = None): + try: + if timeout is not None: + item = await asyncio.wait_for(self.queue.get(), timeout) + else: + item = await self.queue.get() + if isinstance(item, _Sentinel): + self._finished = True + return item + except asyncio.TimeoutError: + raise QueueEmpty + + async def drain_available(self, max_size: int) -> List[T]: + items: list[T] = [] + while len(items) < max_size: + try: + item = self.queue.get_nowait() + if isinstance(item, _Sentinel): + self._finished = True + break + if isinstance(item, _PrefetchException): + item.info.reraise() + items.append(item) + except asyncio.QueueEmpty: + break + return items + + async def qsize(self): + return self.queue.qsize() + + async def stop(self): + self._stopped = True + + +@ray.remote +def _run_producer(queue_actor, producer_fn: Callable[[], Iterator[T]]): + async def _run_producer(queue_actor, producer_fn): + previous_put = None + try: 
+ producer = producer_fn() + del producer_fn + + while True: + next_item = next(producer) + if previous_put is not None: + await previous_put + previous_put = queue_actor.put.remote(next_item) + except StopIteration: + if previous_put is not None: + await previous_put + await queue_actor.put.remote(_SENTINEL) + except Exception as e: + if previous_put is not None: + await previous_put + await queue_actor.put.remote(_PrefetchException(ser_exc_info(e))) + + asyncio.run(_run_producer(queue_actor, producer_fn)) diff --git a/src/levanter/store/cache.py b/src/levanter/store/cache.py index 56aa54f99..eae9f8402 100644 --- a/src/levanter/store/cache.py +++ b/src/levanter/store/cache.py @@ -1,44 +1,36 @@ import asyncio import concurrent +import copy import dataclasses -import heapq import logging as pylogging import os +import pprint +import random import threading import time from asyncio import InvalidStateError from concurrent.futures import Future as threading_Future from contextlib import AbstractContextManager from dataclasses import dataclass -from typing import Any, Callable, Dict, Generic, Iterator, List, Optional, Sequence, TypeVar, Union +from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, TypeVar, Union +import deepdiff import fsspec.core import pyarrow as pa import ray from dataclasses_json import dataclass_json from fsspec import AbstractFileSystem from ray.actor import ActorHandle +from ray.remote_function import RemoteFunction from levanter.data.dataset import AsyncDataset +from levanter.store._prefetch_actor import QueueEmpty, RayPrefetchQueue +from levanter.utils.py_utils import Stopwatch from ..data._preprocessor import BatchProcessor, BatchResult, dict_from_record_batch -from ..data._queue import ( - PriorityWorkItem, - PriorityWorkTaskGroup, - PriorityWorkTaskGroupSpec, - WorkQueueDispatcherActor, - _BatchProcessorQueue, -) from ..data.metrics_monitor import InProgressCacheMetrics, LoggerMetricsMonitor, MetricsMonitor from ..data.sharded_datasource import ShardedDataSource -from ..utils.ray_utils import ( - ExceptionInfo, - RefBox, - SnitchRecipient, - current_actor_handle, - log_failures_to, - ser_exc_info, -) +from ..utils.ray_utils import ExceptionInfo, SnitchRecipient, current_actor_handle, log_failures_to, ser_exc_info from .tree_store import TreeStore @@ -53,10 +45,46 @@ DEFAULT_LOG_LEVEL = pylogging.INFO LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -# TODO: should probably do this in terms of bytes -# this is kinda silly, but the bigger the better. -MIN_ITEMS_TO_WRITE = 32 * 1024 -MAX_TIME_BETWEEN_WRITES = 100.0 + +@dataclass_json +@dataclass(frozen=True) +class CacheOptions: + """ + Configuration for a cache. This is used to configure a few parts of the cache creation process and to + store metadata that can be checked to ensure that the cache being loaded was created with the expected + configuration. It combined with the [[BatchProcessor]] metadata to form the [[CacheMetadata]]. + + It is intended that caching it deterministic conditional on the input data, processor, and these options. + """ + + num_shard_groups: Optional[int] = 128 + """Number of groups to divide the shards into. This is used to parallelize the cache building process without + overloading Ray. If None, all shards will be in their own group.""" + shard_order_randomization_key: Optional[int] = 0 + """A key used to randomize the order of the shards before building and grouping.""" + batch_size: int = 128 + """The batch size to use when processing the data. 
This is used to control the memory usage of the cache building + process. Lower values will use less memory but take somewhat longer to build the cache.""" + + @staticmethod + def default(): + return CacheOptions() + + @staticmethod + def no_fanciness(batch_size: Optional[int] = None): + """ + For testing, disables all the fancy features of the cache. This makes it easier to predict the behavior + """ + if batch_size is None: + batch_size = 128 + return CacheOptions(num_shard_groups=None, shard_order_randomization_key=None, batch_size=batch_size) + + @staticmethod + def one_group(): + """ + For testing, disables all the fancy features of the cache. This makes it easier to predict the behavior + """ + return CacheOptions(num_shard_groups=1, shard_order_randomization_key=None, batch_size=128) def build_or_load_cache( @@ -65,8 +93,8 @@ def build_or_load_cache( processor: BatchProcessor[T, U], await_finished: bool = True, monitors: Optional[Sequence["MetricsMonitor"]] = None, - cache_config: Optional[Dict[str, Any]] = None, - items_per_write: int = MIN_ITEMS_TO_WRITE, + options: CacheOptions = CacheOptions.default(), + force_flush: bool = False, ) -> "TreeCache[U]": """ Produces a sharded cache of the dataset using Ray for distributed processing. The cache can be any path @@ -91,10 +119,9 @@ def build_or_load_cache( monitors: a list of MetricsMonitors to attach to the cache. These will be called periodically with metrics about the cache build process. If None, will add a LoggerMetricsMonitor. - cache_config: A dictionary of configuration options for the cache. This is passed to the cache writer. + options: Configuration for the cache. This is used to configure a few parts of the cache creation process - items_per_write: The number of items to write to the cache at a time. This is a performance tuning parameter, - and you probably don't need to change it. We mostly use it for testing. + force_flush: for testing, forces the cache to flush after every batch. This is useful for testing. Returns: (TreeCache) A TreeCache object that can be used to read the cache. @@ -105,8 +132,8 @@ def build_or_load_cache( cache_dir=cache_dir, shard_source=input_shards, processor=processor, - cache_config=cache_config, - items_per_write=items_per_write, + options=options, + force_flush=force_flush, ) if cache.is_finished: @@ -129,519 +156,551 @@ def build_or_load_cache( return cache -@dataclass_json -@dataclass -class CacheLedger: - # NB: unlike the old cache, the mere existence of a ledger doesn't mean the cache is finished - total_num_rows: int - shard_rows: Dict[str, int] - is_finished: bool = False - finished_shards: List[str] = dataclasses.field(default_factory=list) - field_counts: Dict[str, int] = dataclasses.field(default_factory=dict) - metadata: Dict[str, Any] = dataclasses.field(default_factory=dict) - - -@dataclass -class ShardStatus: - shard_name: str - num_rows_committed: int - is_finished: bool - - -class SerialCacheWriter(AbstractContextManager): - """ - Writes TreeCache-compatible caches to disk. This is a serial version of TreeCacheWriter that doesn't use Ray. - Mostly for scripts and debugging. - - Examples: - >>> with SerialCacheWriter(cache_dir, exemplar) as writer: - ... for batch in process_batches(): - ... 
writer.write_batch(batch) - """ +class TreeCache(AsyncDataset[T_co]): + ledger: Optional["CacheLedger"] + _builder: Optional[ActorHandle] # handle of _TreeStoreCacheBuilder + # monitor_thread waits for new metrics and also periodically reloads the cache + _monitor_thread: Optional[threading.Thread] + _metrics_monitors: List[MetricsMonitor] def __init__( self, cache_dir: str, - exemplar: T, - cache_config: Optional[Dict[str, Any]] = None, + exemplar: T_co, + ledger: Optional["CacheLedger"], + _broker, # handle of _TreeStoreCacheBuilder ): self.cache_dir = cache_dir - self.cache_config = cache_config + self.ledger = ledger + self._was_already_finished = ledger is not None and ledger.is_finished + self._builder = _broker self._exemplar = exemplar - self._tree_store = TreeStore.open(exemplar, self.cache_dir, mode="w") # type: ignore - self._is_closed = False - def __enter__(self) -> "SerialCacheWriter": - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - # if successful, write the ledger - # TODO: store field counts in the ledger - ledger = CacheLedger( - total_num_rows=len(self._tree_store), - is_finished=True, - shard_rows={"": len(self._tree_store)}, - finished_shards=[""], - field_counts={}, - ) + self._metrics_monitors = [] + name = os.path.join(*cache_dir.split("/")[-2:]) + self.logger = pylogging.getLogger(f"TreeCache.{name}") + self._store_future: threading_Future[TreeStore] = threading_Future() + self._stop = False + # assert _broker is None - if exc_type is None: - _serialize_json_and_commit(os.path.join(self.cache_dir, LEDGER_FILE_NAME), ledger) - logger.info(f"Cache ledger written to {self.cache_dir}") - self._is_closed = True + if self._builder is not None: + self._monitor_thread = threading.Thread(target=self._monitor_metrics, daemon=True) + self._monitor_thread.start() + else: + self._attempt_to_load_store() + assert self._store_future.done() - def result(self) -> "TreeCache": - if not self._is_closed: - raise RuntimeError("Cannot get result until TreeCacheWriter is closed") - return TreeCache.load(self.cache_dir, self._exemplar) + @property + def store(self) -> TreeStore[T_co]: + return self._store_future.result() - def write_batch(self, batch: BatchResult): - if isinstance(batch, pa.RecordBatch): - raise NotImplementedError("Only non-RecordBatch batches are supported for now") + async def store_async(self) -> TreeStore[T_co]: + if self._builder is not None: + return await asyncio.wrap_future(self._store_future) + else: + return self.store - batch = _canonicalize_batch(batch) # type: ignore + async def async_len(self) -> int: + if self._builder is not None: + self.await_finished() - self._tree_store.extend(batch) + return len(await self.store_async()) + def __len__(self): + self.await_finished() -def _load_or_initialize_ledger(path): - try: - with fsspec.open(path, "r") as file: - return CacheLedger.from_json(file.read()) - except FileNotFoundError: - return CacheLedger(0, {}) + return len(self.store) + async def final_length_is_known(self) -> bool: + return self.ledger is not None and self.ledger.is_finished -@ray.remote(num_cpus=0.5) # type: ignore -class _OrderedCacheWriter: - """ - This cache writer receives examples from some number of shards (generally out of order) and writes them to the store - in a defined round-robin order. It also keeps track of the metadata for each shard. + def is_finite(self) -> bool: + return True - Once a shard finishes sending batches, it notifies this writer, which then updates the metadata and writes it to disk. 
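As a rough usage sketch (illustrative only, not part of the patch): the snippet below assumes an existing ShardedDataSource my_source and BatchProcessor my_processor, plus a made-up cache path, and shows how CacheOptions plugs into build_or_load_cache and how the resulting TreeCache is read back:

    from levanter.store.cache import CacheOptions, build_or_load_cache

    # my_source (a ShardedDataSource) and my_processor (a BatchProcessor) are assumed to exist
    opts = CacheOptions(num_shard_groups=64, batch_size=256)

    cache = build_or_load_cache(
        "gs://my-bucket/caches/my_dataset",  # hypothetical path
        my_source,
        my_processor,
        await_finished=False,  # return immediately; the cache keeps building in the background
        options=opts,
    )

    cache.await_finished()                 # block until the ledger is marked finished
    rows = cache.get_batch_sync(range(8))  # read the first eight processed rows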
- """ + async def current_len(self) -> int: + if not self._store_future.done(): + return 0 - def __init__( - self, - parent, - name, - exemplar, - batch_size, - cache_dir: str, - shards: Sequence[str], - min_items_to_write=MIN_ITEMS_TO_WRITE, - ): - pylogging.basicConfig(level=DEFAULT_LOG_LEVEL, format=LOG_FORMAT) - with log_failures_to(parent): - self._parent = parent - self.cache_dir = cache_dir - self.shards = shards - self.batch_size = batch_size - self._min_items_to_write = min_items_to_write - self._failed = False - self._logger = pylogging.getLogger(name) - - # these are batches that we've received but haven't ordered them for writing yet - self._batch_queue = GroupRoundRobinBuffer(shards) # type: ignore - self._total_queue_length = 0 - self._was_overwhelmed = False # whether the queue has gotten too big - # writes are very slow (~2s) so we want to batch them up - self._ordered_but_unwritten_items: list = [] - self._batches_in_next_write_by_shard: dict[str, int] = {shard: 0 for shard in shards} - # we also want to write every so often - self._last_write_time = time.time() - - self._ledger = _load_or_initialize_ledger(os.path.join(cache_dir, LEDGER_FILE_NAME)) - self._expected_num_rows: dict[str, Optional[int]] = {shard: None for shard in shards} - - self._tree_store = TreeStore.open(exemplar, self.cache_dir, mode="a") - # careful: trim the store to the total number of rows in the cache that we've committed to - self._tree_store.trim_to_size(self._ledger.total_num_rows) - # we also have to tell the queue how many rows for each shard we've already written - for shard, num_rows in self._ledger.shard_rows.items(): - if num_rows > 0: - self._logger.info(f"Already written {num_rows} rows for shard {shard}") - - # careful: this is in terms of batch size - # Have to round up to the nearest batch size - self._batch_queue.fast_forward(shard, div_round_up(num_rows, self.batch_size)) - if shard in self._ledger.finished_shards: - self._expected_num_rows[shard] = num_rows - self._batch_queue.group_total_known(shard, div_round_up(num_rows, self.batch_size)) - - # double check that we're not finished by committing the ledger - self._attempt_to_write_batches() - - if not self._ledger.is_finished: - self._actual_writer_thread = threading.Thread(target=self._write_loop, daemon=True) - self._stop_loop = threading.Event() - self._actual_writer_thread.start() - - def batch_finished(self, shard_name: str, shard_batch_idx: int, batch_result_box): - with log_failures_to(self._parent): - if self._failed: - self._logger.warning("Received batch after failure. 
Ignoring.") - return + return len(await self.store_async()) - if isinstance(batch_result_box, RefBox): - batch_result = ray.get(batch_result_box.ref) - else: - batch_result = batch_result_box - - # we need to keep track of the order of the batches so that we can write them out in order - self._total_queue_length += len(batch_result) - self._batch_queue.append_to_group(shard_name, shard_batch_idx, batch_result) - next_missing_item = self._batch_queue.next_missing_item_index() - - overwhelmed = self.is_overwhelmed() - if overwhelmed: - if not self._was_overwhelmed: - self._logger.warning(f"Writer queue is getting long ({self._total_queue_length}).") - self._parent.signal_backpressure.remote(next_missing_item) - elif self._was_overwhelmed: - self._logger.info(f"Writer queue is no longer overwhelmed ({self._total_queue_length}).") - self._parent.signal_backpressure.remote(None) - - self._was_overwhelmed = overwhelmed - - def shard_failed(self, shard_name: str, batch_id: int, exc_info: ExceptionInfo): - with log_failures_to(self._parent): - self._failed = True - self._stop_loop.set() - logger.error(f"Shard {shard_name} failed at batch {batch_id}", exc_info=exc_info.restore()) - self._parent.shard_failed.remote(shard_name, exc_info) - - def shard_finished_reading(self, shard_name: str, expected_num_rows: int): - with log_failures_to(self._parent): - # careful: this is in terms of batch size - self._batch_queue.group_total_known(shard_name, div_round_up(expected_num_rows, self.batch_size)) - self._expected_num_rows[shard_name] = expected_num_rows - logger.debug( - f"Attempting to write batches because {shard_name} finished reading with {expected_num_rows} batches." - ) - self.flush() + async def get_batch(self, indices: Sequence[int] | slice): + # this is tricky: we want to wait until either the cache is finished or we have the max index + if isinstance(indices, slice): + start, step, stop = await self._get_start_stops_async(indices) + await self._wait_for_len(max(stop, start)) + indices = range(start, stop, step) - def flush(self): - self._attempt_to_write_batches() + max_index = max(indices) + await self._wait_for_len(max_index + 1) - def get_shard_status(self, shard_name: str): - with log_failures_to(self._parent): - rows = self._ledger.shard_rows.get(shard_name, 0) - is_finished = shard_name in self._ledger.finished_shards - return ShardStatus(shard_name, rows, is_finished) + return await self.store.get_batch(indices) - def get_ledger(self): - return self._ledger + async def _wait_for_len(self, needed_len: int): + if self._builder is not None: + while needed_len > await self.current_len(): + new_ledger: CacheLedger = await self._builder.updated_ledger.remote() - def _attempt_to_write_batches(self): - if self._ledger.is_finished: - return + if needed_len <= new_ledger.total_num_rows: + break - if self._failed: - logger.warning("Not writing batches because of failure.") - return + if new_ledger.is_finished: + if needed_len >= new_ledger.total_num_rows: + raise IndexError( + f"Index {needed_len} out of bounds for cache of size {new_ledger.total_num_rows}" + ) + break + else: + if needed_len > len(self.store): + raise IndexError(f"Index {needed_len} out of bounds for cache of size {len(self.store)}") - self._dequeue_ready_batches() - updated_shards = self._write_available_batches() + def _wait_for_len_sync(self, needed_len, timeout: Optional[float] = None): + time_in = time.time() + t_max = time_in + (timeout or 1e6) + if self._builder is not None: + while needed_len > len(self.store): + cur_time 
= time.time() + if cur_time > t_max: + raise TimeoutError(f"Timed out waiting for cache to reach {needed_len}") + try: + new_ledger: CacheLedger = ray.get( + self._builder.updated_ledger.remote(), timeout=max(t_max - cur_time, 10) + ) + except TimeoutError: + continue - logger.debug(f"Updated shards: {updated_shards}") + if needed_len <= new_ledger.total_num_rows: + break - need_to_commit = len(updated_shards) > 0 - total_rows = self._ledger.total_num_rows + sum(updated_shards.values()) + if new_ledger.is_finished: + if needed_len >= new_ledger.total_num_rows: + raise IndexError( + f"Index {needed_len} out of bounds for cache of size {new_ledger.total_num_rows}" + ) + break + else: + if needed_len > len(self.store): + raise IndexError(f"Index {needed_len} out of bounds for cache of size {len(self.store)}") - for shard, num_rows in updated_shards.items(): - self._ledger.shard_rows[shard] = self._ledger.shard_rows.get(shard, 0) + num_rows + @staticmethod + def load(cache_dir: str, exemplar: T, options: Optional["CacheMetadata"] = None) -> "TreeCache": + """Loads a cache from disk or an object store. Raises FileNotFoundError if the cache doesn't exist""" + logger.info(f"Loading cache from {cache_dir}") + ledger = CacheLedger.load(cache_dir, options) + if not ledger.is_finished: + raise FileNotFoundError(f"Cache at {cache_dir} is not finished. Use build_or_load to build it.") + return TreeCache(cache_dir, exemplar, ledger, None) - futures_to_await_shards, need_to_commit_for_shards = self._check_for_finished_shards() + @staticmethod + def build_or_load( + cache_dir: str, + shard_source: ShardedDataSource[T], + processor: BatchProcessor[T, U], + options: Optional["CacheOptions"] = None, + force_flush: bool = False, + ) -> "TreeCache[U]": + if options is None: + options = CacheOptions.default() + metadata = CacheMetadata(options=options, preprocessor_metadata=processor.metadata) + try: + return TreeCache.load(cache_dir, processor.output_exemplar, metadata) + except FileNotFoundError: + broker = _get_builder_actor( + cache_dir=cache_dir, + shard_source=shard_source, + processor=processor, + options=options, + force_flush=force_flush, + ) + return TreeCache(cache_dir=cache_dir, exemplar=processor.output_exemplar, ledger=None, _broker=broker) - need_to_commit = need_to_commit or need_to_commit_for_shards + def finished_sentinel(self): + """Returns a Ray-awaitable object that will be set when the cache is finished""" + if self._builder is None: + return ray.remote(num_cpus=0)(lambda: None).remote() + else: + return self._builder.finished_sentinel.remote() - futures_to_await = [] - if need_to_commit: - self._ledger.total_num_rows = total_rows - _serialize_json_and_commit(os.path.join(self.cache_dir, LEDGER_FILE_NAME), self._ledger) + @property + def is_finished(self): + return self.ledger is not None and self.ledger.is_finished - futures_to_await.append(self._parent._updated_ledger.remote(self._ledger)) + def __getitem__(self, item): + if isinstance(item, slice): + start, step, stop = self._get_start_stops(item) + # TODO: wait for store to be set + return self.store[start:stop:step] + else: + if item < 0: + item += len(self) + if item < 0 or item >= len(self): + raise IndexError(f"Index {item} out of bounds for cache of size {len(self)}") + return self.store[item] - if self._ledger.is_finished: - f = self._parent._finalize.remote() - futures_to_await.append(f) + def get_batch_sync(self, indices_or_slice, *, timeout: Optional[float] = None): + store = self.store + if isinstance(indices_or_slice, 
slice): + start, step, stop = self._get_start_stops(indices_or_slice) + indices_or_slice = range(start, stop, step) - ray.wait(futures_to_await + futures_to_await_shards) + max_index = max(indices_or_slice) - def _finish(self): - self._stop_loop.set() - self._actual_writer_thread.join() + self._wait_for_len_sync(max_index + 1, timeout=timeout) - def _write_loop(self): - while True: - try: - self._stop_loop.wait(1) - if self._stop_loop.is_set(): - break - except TimeoutError: - pass - self._attempt_to_write_batches() - if self._ledger.is_finished: - break - - def _dequeue_ready_batches(self): - for shard, batch in self._batch_queue.drain(): - logger.debug(f"Writing batch for {shard}") - self._total_queue_length -= len(batch) - self._ordered_but_unwritten_items.extend(batch) - self._batches_in_next_write_by_shard[shard] = self._batches_in_next_write_by_shard.get(shard, 0) + len( - batch - ) + return store.get_batch_sync(indices_or_slice) - def _write_available_batches(self): - if len(self._ordered_but_unwritten_items) == 0: - return {} + def _get_start_stops(self, slice): + start = slice.start or 0 + if slice.stop is None: + stop = len(self) + elif slice.stop < 0: + stop = len(self) + slice.stop + else: + stop = slice.stop + if start < 0: + start = len(self) + slice.start + step = slice.step or 1 + return start, step, stop - any_shard_finished_reading = any(num_rows is not None for num_rows in self._expected_num_rows.values()) - - if ( - len(self._ordered_but_unwritten_items) >= self._min_items_to_write - or (time.time() - self._last_write_time > MAX_TIME_BETWEEN_WRITES) - or any_shard_finished_reading - ): - time_in = time.time() - self._tree_store.extend(self._ordered_but_unwritten_items) - time_out = time.time() - logger.debug(f"Wrote {len(self._ordered_but_unwritten_items)} rows in {time_out - time_in:.2f} seconds") - self._ordered_but_unwritten_items = [] - - written_by_shard = self._batches_in_next_write_by_shard - self._batches_in_next_write_by_shard = {} - self._last_write_time = time.time() - return written_by_shard + async def _get_start_stops_async(self, slice): + start = slice.start or 0 + if slice.stop is None: + stop = await self.async_len() + elif slice.stop < 0: + stop = (await self.async_len()) + slice.stop else: - return {} + stop = slice.stop + if start < 0: + start = (await self.async_len()) + slice.start - def _check_for_finished_shards(self): - futures_to_await_shards = [] - need_to_commit_for_shards = False - for shard, expected_rows in self._expected_num_rows.items(): - if expected_rows is None: - continue - - current_rows = self._ledger.shard_rows.get(shard, 0) - if current_rows == expected_rows: - if shard not in self._ledger.finished_shards: - logger.info(f"Shard {shard} finished.") - self._ledger.finished_shards.append(shard) - futures_to_await_shards.append(self._parent.shard_finished.remote(shard)) - need_to_commit_for_shards = True - elif current_rows > expected_rows: - raise ValueError(f"Shard {shard} has more rows than expected: {current_rows} > {expected_rows}") - - if len(self._ledger.finished_shards) == len(self.shards) and set(self._ledger.finished_shards) == set( - self.shards - ): - self._ledger.is_finished = True - need_to_commit_for_shards = True - return futures_to_await_shards, need_to_commit_for_shards - - def is_overwhelmed(self) -> bool: - max_queue_size = self._min_items_to_write * 3 - return self._total_queue_length > max_queue_size - - def __del__(self): - self._finish() + step = slice.step or 1 + return start, step, stop + def 
await_finished(self, timeout: Optional[float] = None): + if self._builder is None: + return + x = ray.get(self.finished_sentinel(), timeout=timeout) + self._attempt_to_load_store() + return x -def _to_list_of_dicts(batch: dict) -> List[dict]: - """ - Convert a batch of dictionaries to a list of dictionaries, suitable for writing to a cache. - """ - keys = list(batch.keys()) - values = list(batch.values()) - num_rows = len(values[0]) - return [{key: values[i][j] for i, key in enumerate(keys)} for j in range(num_rows)] + async def finished(self): + if self._builder is None: + return + x = await self.finished_sentinel() + # TODO: make an async version of this + self._attempt_to_load_store() + return x + def _attempt_to_load_store(self): + if self._store_future.done(): + return -def _canonicalize_batch(batch: Union[dict, List[dict]]) -> List[dict]: - if isinstance(batch, pa.RecordBatch): - batch = dict_from_record_batch(batch) + try: + store = TreeStore.open(self._exemplar, self.cache_dir, mode="r") + except FileNotFoundError: + assert self._builder is not None + ledger = ray.get(self._builder.current_ledger.remote()) + metrics = _ledger_to_metrics(ledger) + if metrics.rows_finished == 0 and metrics.is_finished: + # this means we built an empty cache. go with it + store = TreeStore.open(self._exemplar, f"memory://{self.cache_dir}", mode="a") + else: + raise + try: + self._store_future.set_result(store) + except concurrent.futures.InvalidStateError: + pass - if isinstance(batch, dict): - return _to_list_of_dicts(batch) - else: - return batch + def attach_metrics_monitor(self, monitor: MetricsMonitor): + if self._builder is None: + logger.warning("Cannot attach metrics monitor to finished cache.") + # TODO: decide what to do about attaching if the cache is already finished + # maybe get the final metrics? + return + + self._metrics_monitors.append(monitor) + + def _monitor_metrics(self): + while not self._stop: + try: + try: + # it's better to let the Ray actor handle the timeout + ledger_or_timeout = ray.get(self._builder.updated_ledger.remote(timeout=4.0), timeout=10.0) + if isinstance(ledger_or_timeout, Exception): + raise ledger_or_timeout + self.ledger = ledger_or_timeout + metrics = _ledger_to_metrics(self.ledger) + for monitor in self._metrics_monitors: + monitor(metrics) + if metrics.is_finished: + break + except TimeoutError: + pass + except Exception as e: + if str(e).startswith("Failed to submit task to actor"): + logger.warning("Cache builder actor is gone. Stopping monitoring.") + break + else: + raise + try: + self._attempt_to_load_store() + except FileNotFoundError: + pass + except Exception as e: + if str(e).startswith("Failed to submit task to actor"): + logger.warning("Cache builder actor is gone. 
Stopping monitoring.") + break + else: + self.logger.exception("Error while reading metrics from shard cache.") + raise e -# thinking through the design of the cache system +@dataclass_json +@dataclass +class CacheLedger: + # NB: unlike the old cache, the mere existence of a ledger doesn't mean the cache is finished + total_num_rows: int + shard_rows: Dict[str, int] + is_finished: bool = False + finished_shards: List[str] = dataclasses.field(default_factory=list) + field_counts: Dict[str, int] = dataclasses.field(default_factory=dict) + metadata: "CacheMetadata" = dataclasses.field(default_factory=lambda: CacheMetadata(CacheOptions(), {})) + + @staticmethod + def load_or_initialize( + cache_dir: str, source: ShardedDataSource, processor: BatchProcessor, config: "CacheOptions" + ): + metadata = CacheMetadata(options=config, preprocessor_metadata=processor.metadata) + try: + return CacheLedger.load(cache_dir, metadata) + except FileNotFoundError: + return CacheLedger( + total_num_rows=0, + shard_rows={shard: 0 for shard in source.shard_names}, + is_finished=False, + metadata=metadata, + ) -# we decided to use Ray, which was maybe a mistake, but here we are. -# Ray doesn't like it when the number of actors gets too large, so we can't have one actor per shard. -# we have N nodes and K shards. + @staticmethod + def load(cache_dir: str, metadata: Optional["CacheMetadata"] = None) -> "CacheLedger": + ledger_path = os.path.join(cache_dir, LEDGER_FILE_NAME) + try: + logger.debug(f"Attempting to load cache ledger from {ledger_path}") + with fsspec.open(ledger_path) as file: + cache_ledger = CacheLedger.from_json(file.read()) # type: ignore + if metadata: + diff = cache_ledger.metadata.compare_to(metadata) + if not diff: + logger.debug("Metadata matches") + else: + logger.warning(f"Metadata mismatch: {pprint.pformat(diff, indent=2)}") + return cache_ledger + except FileNotFoundError: + raise FileNotFoundError(f"Cache ledger not found at {ledger_path}") -# at a high level, we have 3 steps: -# 1. read batches from the shard source -# 2. process batches -# 3. write batches to the cache for that shard + def _serialize_and_commit(self, cache_dir): + path = os.path.join(cache_dir, LEDGER_FILE_NAME) + return _serialize_json_and_commit(path, self) # type: ignore -# The difficulty is that we want parallelism, and we want to control the order of the written data. -# Reading batches requires CPU and network. -# ==> This means we should limit the number of shard groups to roughly the number of nodes, maybe times 2. -# We ideally want to read from shards roughly evenly (at least within a group of shards) +@dataclass_json +@dataclass(frozen=True) +class CacheMetadata: + options: CacheOptions = CacheOptions.default() + preprocessor_metadata: Optional[dict[str, Any]] = None -def _shard_reader_generator(shard_source: ShardedDataSource[T], shard_name: str, start_row: int, batch_size: int): - shard_iter = shard_source.open_shard_at_row(shard_name, start_row) - batch = [] - for row in shard_iter: - batch.append(row) + def compare_to(self, other: "CacheMetadata") -> deepdiff.DeepDiff: + """ + Compare this metadata to another set of metadata. This is used to check if the cache being loaded + was created with the expected configuration. - if len(batch) == batch_size: - yield batch - batch = [] + if other.preprocessor_metadata is None, we ignore it for the purposes of comparison. 
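For illustration (the concrete values are made up), a mismatch in options surfaces as a non-empty DeepDiff, while leaving preprocessor_metadata unset on the requested side skips that part of the check:

    from levanter.store.cache import CacheMetadata, CacheOptions

    on_disk = CacheMetadata(options=CacheOptions(batch_size=128), preprocessor_metadata={"tokenizer": "gpt2"})
    requested = CacheMetadata(options=CacheOptions(batch_size=256), preprocessor_metadata={"tokenizer": "gpt2"})

    assert on_disk.compare_to(requested)  # non-empty diff: batch_size differs
    assert not on_disk.compare_to(CacheMetadata(options=CacheOptions(batch_size=128)))  # metadata ignored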
+ """ + if other.preprocessor_metadata is None: + sorta_self = dataclasses.replace(self, preprocessor_metadata=None) + else: + sorta_self = self + return deepdiff.DeepDiff(sorta_self, other) - if len(batch) > 0: - yield batch + @staticmethod + def empty(): + return CacheMetadata() @dataclass -class ShardGroupToBeProcessed(PriorityWorkTaskGroupSpec): - name: str - builder_ref: ray.actor.ActorHandle # _TreeStoreCacheBuilder - writer: ray.actor.ActorHandle # _GroupedShardWriter - shard_source: ShardedDataSource - shard_names: Sequence[str] - priority_fn: Callable[[int, int], float] - processor_actor: ray.actor.ActorHandle # BatchProcessorQueue - batch_size: int - group_id: int - - def build(self) -> "PriorityWorkTaskGroup": - return ShardGroupTaskGroup(self) - - -class ShardGroupTaskGroup(PriorityWorkTaskGroup): - def __init__(self, spec: ShardGroupToBeProcessed): - self.spec: ShardGroupToBeProcessed = spec - self.logger = pylogging.getLogger(f"shard_reader.{spec.group_id}.{spec.name}") - - current_shard_status: dict[str, ShardStatus] = {} - for shard_name in self.spec.shard_names: - try: - current_shard_status[shard_name] = ray.get(self.spec.writer.get_shard_status.remote(shard_name)) - except Exception as e: - self.spec.builder_ref.shard_failed.remote(shard_name, ser_exc_info()) - raise e +class _ShardStatus: + shard_name: str + num_rows_committed: int + is_finished: bool - batch_size = self.spec.batch_size - self._items: list[PriorityWorkItem] = [] +class SerialCacheWriter(AbstractContextManager): + """ + Writes TreeCache-compatible caches to disk. This is a serial version of TreeCacheWriter that doesn't use Ray. + Mostly for scripts and debugging. - for shard_name in self.spec.shard_names: - try: - status = current_shard_status[shard_name] - if status.is_finished: - self.logger.info(f"Shard {shard_name} already finished. Skipping.") - continue + Examples: + >>> with SerialCacheWriter(cache_dir,exemplar) as writer: + ... for batch in process_batches(): + ... 
writer.write_batch(batch) + """ - reader = _shard_reader_generator( - self.spec.shard_source, shard_name, status.num_rows_committed, batch_size - ) + def __init__( + self, + cache_dir: str, + exemplar: T, + metadata: Optional["CacheMetadata"] = None, + ): + self.cache_dir = cache_dir + self.metadata = metadata + self._exemplar = exemplar + self._tree_store = TreeStore.open(exemplar, self.cache_dir, mode="w", cache_metadata=True) + self._is_closed = False + + def __enter__(self) -> "SerialCacheWriter": + return self - task_name = f"shard_reader.{self.spec.name}.{shard_name}" + def __exit__(self, exc_type, exc_val, exc_tb): + # if successful, write the ledger + # TODO: store field counts in the ledger + ledger = CacheLedger( + total_num_rows=len(self._tree_store), + is_finished=True, + shard_rows={"": len(self._tree_store)}, + finished_shards=[""], + field_counts={}, + metadata=self.metadata or CacheMetadata.empty(), + ) - batch_idx = status.num_rows_committed // batch_size + if exc_type is None: + ledger._serialize_and_commit(self.cache_dir) + logger.info(f"Cache ledger written to {self.cache_dir}") + self._is_closed = True - shard_idx = self.spec.shard_source.shard_names.index(shard_name) - item = ShardReaderItem( - self, - task_name, - shard_name, - shard_idx, - batch_idx=batch_idx, - reader=reader, - current_row=status.num_rows_committed, - ) + def result(self) -> "TreeCache": + if not self._is_closed: + raise RuntimeError("Cannot get result until TreeCacheWriter is closed") + return TreeCache.load(self.cache_dir, self._exemplar, self.metadata) - heapq.heappush(self._items, item) - except Exception as e: - self.logger.exception(f"Error while initializing shard {shard_name}") - self.spec.writer[shard_name].shard_failed.remote(ser_exc_info()) - raise e + def write_batch(self, batch: BatchResult): + if isinstance(batch, pa.RecordBatch): + raise NotImplementedError("Only non-RecordBatch batches are supported for now") - @property - def name(self): - return self.spec.name + batch = _canonicalize_batch(batch) # type: ignore - def items(self) -> Sequence["PriorityWorkItem"]: - return self._items + self._tree_store.extend(batch) -# NB This class is stateful -@dataclass -class ShardReaderItem(PriorityWorkItem): +class ShardedCacheWriter: """ - Each time execute is called, this class reads a batch of examples from the shard - and dispatches them to the processor. + Similar to SerialCacheWriter, but tracks shard metadata. + + Similar to _OrderedCacheWriter, it also supports resuming, and it + groups together batches before writing (at some interval) in order to improve performance. 
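A rough sketch of how this writer is driven (illustrative; source, processor, and read_and_process_shard are stand-ins rather than names from this patch):

    from levanter.store.cache import CacheLedger, CacheOptions, ShardedCacheWriter

    ledger = CacheLedger.load_or_initialize("/tmp/my_cache", source, processor, CacheOptions.default())
    writer = ShardedCacheWriter("/tmp/my_cache", ledger, processor.output_exemplar)

    for shard_name in source.shard_names:
        rows = ledger.shard_rows.get(shard_name, 0)  # resume point for this shard
        for batch in read_and_process_shard(shard_name, start_row=rows):  # hypothetical helper
            writer.write_batch(shard_name, batch)
            rows += len(batch)
        writer.finish_shard(shard_name, rows)

    store = writer.finish()  # flushes, marks the ledger finished, and returns the underlying TreeStore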
""" - group: ShardGroupTaskGroup - name: str - shard_name: str - shard_idx: int - batch_idx: int - reader: Iterator[list] - current_row: int = 0 + def __init__( + self, + cache_dir: str, + initial_ledger: CacheLedger, + exemplar: T, + on_write: Optional[Callable[[CacheLedger], None]] = None, + ): + self.cache_dir = cache_dir + self._on_write = on_write + + self._ledger = copy.deepcopy(initial_ledger) + + self._tree_store = TreeStore.open(exemplar, self.cache_dir, mode="a") # type: ignore + self._tree_store.trim_to_size(self._ledger.total_num_rows) + self._items_ready_to_write: list = [] @property - def priority(self): - return self.group.spec.priority_fn(self.shard_idx, self.batch_idx) + def ledger(self): + return self._ledger + + # we have both versions b/c we need this one for actors + def get_ledger(self): + return self._ledger @property - def spec(self): - return self.group.spec + def is_finished(self): + return self._ledger.is_finished - def execute(self) -> tuple[bool, Optional[ray.ObjectRef]]: - writer = self.spec.writer - write_finished_ref = None + def finish_shard(self, shard_name: str, num_rows: int): + self.flush() + current_rows = self._ledger.shard_rows.get(shard_name, 0) + if current_rows != num_rows: + raise ValueError(f"Expected {num_rows} rows in finished shard {shard_name}, but found {current_rows}") - self.group.logger.debug(f"Reading one batch of shard {self.shard_name}: {self.batch_idx}") + self._ledger.finished_shards.append(shard_name) + self._ledger._serialize_and_commit(self.cache_dir) - try: - batch = next(self.reader, None) - exhausted_shard = batch is None or (len(batch) < self.spec.batch_size) + def write_batch(self, shard_name: str, batch: BatchResult): + if self.is_finished: + raise RuntimeError("Cannot write to a finished cache") - if batch: - priority = self.spec.priority_fn(self.shard_idx, self.batch_idx) - try: - batch_result_ref = ray.get( - self.spec.processor_actor.submit.remote( - priority=priority, - desc=f"{self.shard_name}.{self.batch_idx}", - batch=RefBox(ray.put(batch)), - ) - ) - logger.debug(f"Got batch result: {batch_result_ref}") - write_finished_ref = writer.batch_finished.remote( - self.shard_name, self.batch_idx, RefBox(batch_result_ref) - ) - self.batch_idx += 1 - self.current_row += len(batch) - except Exception as e: - self.group.logger.exception(f"Error while processing batch {self.batch_idx}") - # fire and forget - writer.shard_failed.remote(self.shard_name, self.batch_idx, ser_exc_info()) - raise e + if isinstance(batch, pa.RecordBatch): + raise NotImplementedError("Only non-RecordBatch batches are supported for now") - if exhausted_shard: - logger.info(f"Shard {self.shard_name} exhausted. 
Expecting {self.current_row} rows.") - writer.shard_finished_reading.remote(self.shard_name, self.current_row) + batch = _canonicalize_batch(batch) # type: ignore - self.group.logger.debug(f"Finished reading one batch of shard {self.shard_name}: {self.batch_idx}") + self._items_ready_to_write.append((shard_name, batch)) - return exhausted_shard, write_finished_ref - except Exception as e: # noqa - self.group.logger.exception(f"Error while processing shard {self.shard_name}") - # fire and forget - writer.shard_failed.remote(self.shard_name, self.batch_idx, ser_exc_info()) - raise e + def flush(self): + self._attempt_to_write_batches() + + def finish(self): + self.flush() + + # if successful, write the ledger + logger.info("Finished writing cache") + self._ledger.is_finished = True + self._ledger._serialize_and_commit(self.cache_dir) + if self._on_write: + self._on_write(self._ledger) + + return self._tree_store + + def _attempt_to_write_batches(self): + if self._ledger.is_finished: + return + + if not self._items_ready_to_write: + return + + updated_shards = self._write_available_batches() + + logger.debug(f"Updated shards: {updated_shards}") + + did_write = len(updated_shards) > 0 + + if did_write: + + for shard, num_rows in updated_shards.items(): + self._ledger.shard_rows[shard] = self._ledger.shard_rows.get(shard, 0) + num_rows + + total_rows = self._ledger.total_num_rows + sum(updated_shards.values()) + self._ledger.total_num_rows = total_rows + self._ledger._serialize_and_commit(self.cache_dir) + + if self._on_write: + self._on_write(self._ledger) + + def _write_available_batches(self): + ready = self._items_ready_to_write + self._items_ready_to_write = [] + + if len(ready) == 0: + return {} + + to_write = [] + written_by_shard = {} + for shard, batch in ready: + to_write.extend(batch) + written_by_shard[shard] = written_by_shard.get(shard, 0) + len(batch) + + self._tree_store.extend(to_write) + return written_by_shard def _serialize_json_and_commit(path, obj): @@ -657,17 +716,6 @@ def _serialize_json_and_commit(path, obj): fs.rename(f"{path}.tmp", path) -def _load_cache_ledger(cache_dir) -> CacheLedger: - try: - ledger_path = os.path.join(cache_dir, LEDGER_FILE_NAME) - logger.debug(f"Attempting to load cache ledger from {ledger_path}") - with fsspec.open(ledger_path) as file: - cache_ledger = CacheLedger.from_json(file.read()) # type: ignore - return cache_ledger - except FileNotFoundError: - raise FileNotFoundError(f"Cache ledger not found at {ledger_path}") - - @ray.remote(num_cpus=0.1) # keep this small b/c it doesn't do a lot class _TreeStoreCacheBuilder(SnitchRecipient): """ @@ -682,119 +730,42 @@ def __init__( name: str, source: ShardedDataSource[T], processor: BatchProcessor[T, U], - cache_config: Dict[str, Any], - min_items_to_write: int, + options: CacheOptions, + force_flush: bool, ): pylogging.basicConfig(level=DEFAULT_LOG_LEVEL, format=LOG_FORMAT) self.logger = pylogging.getLogger(f"{__name__}.{name}") - self.source = source - self._cache_dir = cache_dir - # self._metrics = InProgressCacheMetrics() - self._updated_ledger_condition = asyncio.Condition() - self._ledger = CacheLedger(0, {}) - self.shards_in_progress: set[str] = set() - exemplar = processor.output_exemplar - self._finished_promise: asyncio.Future[None] = asyncio.Future() - # used to subscribe to metrics updates - self._cache_config = cache_config - path_for_name = os.path.join(*self._cache_dir.split("/")[-2:]) - name = f"broker::{path_for_name}" - self.logger = pylogging.getLogger(f"{name}") - 
self._cache_writer: Optional[ActorHandle] = _OrderedCacheWriter.remote( # type: ignore - current_actor_handle(), - f"writer::{path_for_name}", - exemplar, - processor.batch_size, - cache_dir, - source.shard_names, - min_items_to_write, - ) - try: - cache_ledger = _load_cache_ledger(self._cache_dir) - self._ledger = cache_ledger - except FileNotFoundError: - pass - - if self._ledger.is_finished: - self._finished_promise.set_result(None) - self._start_workers(cache_dir, name, processor, source) - - def _start_workers(self, cache_dir, name, processor, source): - if len(source.shard_names) == 0: - self.logger.warning("No shards to index?!?") - self._finalize() - else: - self.logger.debug(f"Starting cache build for {source.shard_names}") - self.logger.info(f"Starting cache build for {len(source.shard_names)} shards") - - self_ref = current_actor_handle() - - self._shard_writers = [] - self._shard_readers = [] - self._processor_actors = [] - - for shard_name in source.shard_names: - self.shards_in_progress.add(shard_name) - - num_shards = len(source.shard_names) - num_worker_groups = len(ray.nodes()) - num_shard_groups = max(min(num_worker_groups, num_shards), 1) - - # if we have a bunch of caches to build with one shard, we don't want them all - # assigned to the same node, so we use an offset based on the hash of the name (for stability) - # in an attempt to spread them out - group_offset = int(hash(name) % num_worker_groups) - - shard_groups: list[list[str]] = [[] for _ in range(num_shard_groups)] - for i, shard_name in enumerate(source.shard_names): - shard_groups[i % num_shard_groups].append(shard_name) - - def priority_fn(shard_idx, batch_idx): - return batch_idx * num_shards + shard_idx - - for group_id, shard_group in enumerate(shard_groups): - # TODO: would probably be better if we didn't create one of these per shard group - processor_actor = _BatchProcessorQueue.remote(processor) # type: ignore - self._processor_actors.append(processor_actor) - - assert self._cache_writer is not None - - work_item = ShardGroupToBeProcessed( - name=name, - builder_ref=self_ref, - writer=self._cache_writer, - shard_source=source, - shard_names=shard_group, - priority_fn=priority_fn, - processor_actor=processor_actor, - batch_size=processor.batch_size, - group_id=group_id, - ) + self.source = source + self._cache_dir = cache_dir + self._options = options + self._updated_ledger_condition = asyncio.Condition() # used to subscribe to metrics updates - # we want global names so that different tasks can coordinate priorities - worker_to_assign = (group_id + group_offset) % num_worker_groups - priority_actor_name = f"priority_processor.{worker_to_assign}" + self._ledger = CacheLedger.load_or_initialize(cache_dir, source, processor, options) - reader_actor = WorkQueueDispatcherActor.options( # type: ignore - name=priority_actor_name, get_if_exists=True - ).remote() - - reader_actor.assign_work.remote(work_item) - self._shard_readers.append(reader_actor) + if self._ledger.is_finished: + self._finished_promise.set_result(None) - def shard_finished(self, shard_name: str): - """Callback method for when a shard worker has finished.""" - self.shards_in_progress.remove(shard_name) + path_for_name = os.path.join(*self._cache_dir.split("/")[-2:]) + name = f"broker::{path_for_name}" + self.logger = pylogging.getLogger(f"{name}") - def shard_failed(self, shard_name: str, error: ExceptionInfo): - """Callback method for when a shard worker has failed.""" - self._writer_exception(shard_name, error) + if 
self._ledger.is_finished: + self.logger.info("Cache already finished. Nothing to do.") + return + self._cache_writer = _core_writer_task.remote( + current_actor_handle(), cache_dir, self._ledger, source, processor, force_flush + ) + except Exception: + # Ray behaves poorly if the constructor of an actor fails, so we catch and log here + # this also propagates to the finished promise, so we can handle it there + self._writer_exception(None, ser_exc_info()) - def _updated_ledger(self, ledger: CacheLedger): - self._ledger = ledger - self._do_notify() + def current_ledger(self): + if self._finished_promise.done() and self._finished_promise.exception() is not None: + raise self._finished_promise.exception() + return self._ledger def other_failed(self, error: ExceptionInfo): """Callback method for when a shard worker has failed.""" @@ -805,21 +776,37 @@ def _child_failed(self, child: ray.actor.ActorHandle, exception: ExceptionInfo): self._writer_exception(None, exception) def is_finished(self): + if self.failed(): + return False return self._ledger.is_finished + def failed(self): + return self._finished_promise.done() and self._finished_promise.exception() is not None + async def finished_sentinel(self): await self._finished_promise - async def updated_ledger(self) -> CacheLedger: + async def updated_ledger(self, timeout: float | None = None) -> CacheLedger | TimeoutError: + """ + NB: we **return** a timeout error, we don't raise it. This is because we want to find real failures + in the ray dashboard, and it's a real pain to find exceptions in the logs. + """ if self._finished_promise.done(): if self._finished_promise.exception() is not None: raise self._finished_promise.exception() # type: ignore else: return self._ledger - async with self._updated_ledger_condition: - await self._updated_ledger_condition.wait() + try: + async with self._updated_ledger_condition: + cond = self._updated_ledger_condition.wait() + if timeout is not None: + await asyncio.wait_for(cond, timeout=timeout) + else: + await cond return self._ledger + except asyncio.TimeoutError: + return TimeoutError("Timed out waiting for cache to update") def _writer_exception(self, shard_name, exc_info: ExceptionInfo): info = exc_info.restore() @@ -834,6 +821,26 @@ def _writer_exception(self, shard_name, exc_info: ExceptionInfo): pass self._do_notify() + def _notify_updated_ledger(self, ledger: CacheLedger): + """ + Called by the cache writer when it has updated the ledger. 
+ """ + was_finished = self._ledger.is_finished + self._ledger = ledger + + if was_finished: + raise RuntimeError("Ledger was already finished") + + if self._ledger.is_finished: + logger.info(f"Finalizing cache {self._cache_dir}...") + # guard against invalid state errors + if not self._finished_promise.done(): + self._finished_promise.set_result(None) + + self._cache_writer = None + + self._do_notify() + def _do_notify(self): async def _do_notify_async(): async with self._updated_ledger_condition: @@ -841,36 +848,8 @@ async def _do_notify_async(): asyncio.create_task(_do_notify_async()) - def current_ledger(self): - return self._ledger - - def _finalize(self): - logger.info(f"Finalizing cache {self._cache_dir}...") - - self._ledger.is_finished = True - self._finished_promise.set_result(None) - - # notify metrics subscribers - self._do_notify() - self._cache_writer = None - - def signal_backpressure(self, next_item_desired: Optional[int]): - # get the priority of the item we want - if next_item_desired is not None: - self.logger.debug(f"Signaling backpressure for {next_item_desired}") - # our priority function above is basically (batch_index, shard_index). We just ask we don't get more - # than one round of batches ahead - max_priority = (next_item_desired + 1) * len(self.source.shard_names) - - for reader in self._shard_readers: - reader.set_max_dispatch_priority.remote(max_priority) - else: - self.logger.debug("Signaling no backpressure") - for reader in self._shard_readers: - reader.set_max_dispatch_priority.remote(None) - -def _get_builder_actor(cache_dir, input_shards, processor, cache_config=None, items_per_write=MIN_ITEMS_TO_WRITE): +def _get_builder_actor(cache_dir, shard_source, processor, options=CacheOptions.default(), force_flush=False): name = f"lev_cache_manager::{cache_dir}" path_for_name = os.path.join(*os.path.split(cache_dir)[-2:]) name_for_display = f"builder::{path_for_name}" @@ -878,478 +857,445 @@ def _get_builder_actor(cache_dir, input_shards, processor, cache_config=None, it return _TreeStoreCacheBuilder.options(name=name, get_if_exists=True).remote( # type: ignore name=name_for_display, cache_dir=cache_dir, - source=input_shards, + source=shard_source, processor=processor, - cache_config=cache_config, - min_items_to_write=items_per_write, + options=options, + force_flush=force_flush, ) -class TreeCache(AsyncDataset[T_co]): - ledger: Optional[CacheLedger] - _broker: Optional[ActorHandle] - # monitor_thread waits for new metrics and also periodically reloads the cache - _monitor_thread: Optional[threading.Thread] - _metrics_monitors: List[MetricsMonitor] +##### +# Core implementation starts below. +##### +# The main idea is to have a bunch of reader tasks that read batches, dispatch tokenization tasks, producing +# a stream of tokenized batches. We then interleave these tokenized batches and write them to the cache. +# The reader tasks are given a group of shards, which are implicitly concatenated together. - def __init__( - self, - cache_dir: str, - exemplar: T_co, - ledger: Optional[CacheLedger], - _broker, # handle of _TreeStoreCacheBuilder - ): - self.cache_dir = cache_dir - self.ledger = ledger - self._was_already_finished = ledger is not None and ledger.is_finished - self._broker = _broker - self._exemplar = exemplar +# This is still much slower than I would like but I haven't figured out why yet. 
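A deliberately simplified, Ray-free sketch of that design (one reader generator per shard group, interleaved round-robin into a single writer), assuming each group yields already-processed batches:

    def read_group(shards):
        # one reader per group; the shards in a group are implicitly concatenated
        for shard in shards:
            for batch in shard:
                yield batch

    def interleave(groups):
        # round-robin over the per-group readers, dropping readers as they run out
        readers = [read_group(g) for g in groups]
        live = list(range(len(readers)))
        while live:
            for i in list(live):
                try:
                    yield next(readers[i])
                except StopIteration:
                    live.remove(i)

    # in the real pipeline each reader runs behind a RayPrefetchQueue, and the single
    # _core_writer_task below drains the interleaved stream into a ShardedCacheWriter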
+# TODO: +# - [ ] Profile the tokenization process more (see TIME comments) +# - [ ] Try Ray's autoscaling actorpool if the issue is tokenization isn't fast enough +# https://github.com/ray-project/ray/blob/1bab09bf842edee51c3778be4cfb16f8b900d764/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py +# - [ ] More observability into what's queued and how long work items take - self._metrics_monitors = [] - name = os.path.join(*cache_dir.split("/")[-2:]) - self.logger = pylogging.getLogger(f"TreeCache.{name}") - self._store_future: threading_Future[TreeStore] = threading_Future() - self._stop = False - # assert _broker is None - - if self._broker is not None: - self._monitor_thread = threading.Thread(target=self._monitor_metrics, daemon=True) - self._monitor_thread.start() - else: - self._attempt_to_load_store() - assert self._store_future.done() - @property - def store(self) -> TreeStore[T_co]: - return self._store_future.result() +@dataclass +class _Batch: + """ + A batch of data that has either been read or tokenized. + """ - async def store_async(self) -> TreeStore[T_co]: - if self._broker is not None: - return await asyncio.wrap_future(self._store_future) - else: - return self.store + shard_name: str + row_indices: List[int] + payload: ray.ObjectRef - async def async_len(self) -> int: - if self._broker is not None: - self.await_finished() - return len(await self.store_async()) +@dataclass +class _ShardFinished: + """ + A message indicating that a shard has finished. + """ - def __len__(self): - self.await_finished() + shard_name: str + total_rows: int - return len(self.store) - async def final_length_is_known(self) -> bool: - if self._broker is not None: - return await self._broker.is_finished.remote() +_Message = _Batch | _ShardFinished +""" +A message that can be sent from a reader task to the writer task. +""" - return True +_TIME_BETWEEN_WRITES = 20.0 # seconds +_MAX_WRITE_BATCHES = 1000 +_MIN_WRITE_BATCHES = 100 - def is_finite(self) -> bool: - return True - async def current_len(self) -> int: - if not self._store_future.done(): - return 0 +@ray.remote(num_cpus=1) +def _core_writer_task( + parent, + cache_dir, + initial_ledger: CacheLedger, + source: ShardedDataSource, + processor, + force_flush: bool, +): + """ + This is the main task that processes the data and writes it to the cache. 
- return len(await self.store_async()) + It chains together: + * 1 generator per shard group + * interleaving of the generators + * processing of the batches + * writing of the batches to the cache + """ + logger.setLevel(DEFAULT_LOG_LEVEL) + logger.info("Starting writer task") - async def get_batch(self, indices: Sequence[int] | slice): - # this is tricky: we want to wait until either the cache is finished or we have the max index - if isinstance(indices, slice): - start, step, stop = await self._get_start_stops_async(indices) - await self._wait_for_len(max(stop, start)) - indices = range(start, stop, step) + name = str(os.path.join(*cache_dir.split("/")[-2:])) + # append a small random number to the name to avoid collisions + name += f"::{random.randint(0, 1000)}" - max_index = max(indices) - await self._wait_for_len(max_index + 1) + def on_write(ledger): + ray.get(parent._notify_updated_ledger.remote(ledger)) - return await self.store.get_batch(indices) + with log_failures_to(parent): + sharded_cache_writer = ray.remote(ShardedCacheWriter).remote( + cache_dir, initial_ledger, processor.output_exemplar, on_write=on_write + ) - async def _wait_for_len(self, needed_len: int): - if self._broker is not None: - while needed_len > await self.current_len(): - new_ledger: CacheLedger = await self._broker.updated_ledger.remote() + interleave: RayPrefetchQueue = RayPrefetchQueue( + lambda: _make_interleave(name, source, initial_ledger, processor), + 4096, + producer_options={"num_cpus": 1, "name": f"{name}::interleave"}, + ) - if needed_len <= new_ledger.total_num_rows: - break + total_time = Stopwatch() + loading_time = Stopwatch() + append_time = Stopwatch() + flush_time = Stopwatch() + flush_amortized_time = Stopwatch() - if new_ledger.is_finished: - if needed_len >= new_ledger.total_num_rows: - raise IndexError( - f"Index {needed_len} out of bounds for cache of size {new_ledger.total_num_rows}" - ) - break - else: - if needed_len > len(self.store): - raise IndexError(f"Index {needed_len} out of bounds for cache of size {len(self.store)}") + i = 0 + batches_since_last_write = 0 + time_of_last_write = time.time() + last_flush_future: Optional[ray.ObjectRef] = None + # start_of_last_flush = time_of_last_write - def _wait_for_len_sync(self, needed_len, timeout: Optional[float] = None): - time_in = time.time() - t_max = time_in + (timeout or 1e6) - if self._broker is not None: - while needed_len > len(self.store): - cur_time = time.time() - if cur_time > t_max: - raise TimeoutError(f"Timed out waiting for cache to reach {needed_len}") + # for i, batch_box in enumerate(interleave): + while True: + with total_time: # 0.014 try: - new_ledger: CacheLedger = ray.get( - self._broker.updated_ledger.remote(), timeout=max(t_max - cur_time, 10) - ) - except TimeoutError: - continue - - if needed_len <= new_ledger.total_num_rows: + cur_time = time.time() + time_since_last_write = cur_time - time_of_last_write + remaining_time = _TIME_BETWEEN_WRITES - time_since_last_write + + if batches_since_last_write > 0: + with flush_amortized_time: + if remaining_time <= 0 or batches_since_last_write >= _MAX_WRITE_BATCHES or force_flush: + with flush_time: + # TODO: don't block? 
+ if last_flush_future: + ray.get(last_flush_future) + # print( + # f"Flushed {batches_since_last_write} batches in" + # f" {time.time() - start_of_last_flush} seconds" + # ) + last_flush_future = sharded_cache_writer.flush.remote() + # start_of_last_flush = time.time() + batches_since_last_write = 0 + time_of_last_write = time.time() + continue + else: + remaining_time = _TIME_BETWEEN_WRITES + + with loading_time: + try: + message = interleave.get_next(timeout=max(remaining_time, 0.1)) + except QueueEmpty: + logger.info("Writer running ahead of reader.") + continue + + with append_time: + match message: + case _Batch(shard, _, payload): + # TODO: ensure indices are what we expect + sharded_cache_writer.write_batch.remote(shard, payload) + batches_since_last_write += 1 + i += 1 + case _ShardFinished(shard, total_rows): + ray.get(sharded_cache_writer.finish_shard.remote(shard, total_rows)) + case _: + raise AssertionError(f"Unexpected message type {type(message)}") + + # if i % 1000 == 0: + # print( + # f"Processed {i} batches: {loading_time.average()}s load," + # f" {append_time.average()}s append, {flush_time.average()}s flush blocked, " + # f"{flush_amortized_time.average()}s amortized flush, " + # f"{total_time.average()}s total" + # ) + except StopIteration: + logger.info("Finished all shards") break + except Exception as e: + logger.exception("Error while processing batch") + raise e - if new_ledger.is_finished: - if needed_len >= new_ledger.total_num_rows: - raise IndexError( - f"Index {needed_len} out of bounds for cache of size {new_ledger.total_num_rows}" - ) - break - else: - if needed_len > len(self.store): - raise IndexError(f"Index {needed_len} out of bounds for cache of size {len(self.store)}") - - @staticmethod - def load(cache_dir: str, exemplar: T) -> "TreeCache": - """Loads a cache from disk or an object store. Raises FileNotFoundError if the cache doesn't exist""" - logger.info(f"Loading cache from {cache_dir}") - ledger = _load_cache_ledger(cache_dir) - if not ledger.is_finished: - raise FileNotFoundError(f"Cache at {cache_dir} is not finished. 
Use build_or_load to build it.") - return TreeCache(cache_dir, exemplar, ledger, None) - - @staticmethod - def build_or_load( - cache_dir: str, - shard_source: ShardedDataSource[T], - processor: BatchProcessor[T, U], - cache_config: Optional[Dict[str, Any]] = None, - items_per_write: int = MIN_ITEMS_TO_WRITE, - ) -> "TreeCache[U]": - try: - return TreeCache.load(cache_dir, processor.output_exemplar) - except FileNotFoundError: - broker = _get_builder_actor( - cache_dir=cache_dir, - input_shards=shard_source, - processor=processor, - cache_config=cache_config, - items_per_write=items_per_write, - ) - return TreeCache(cache_dir=cache_dir, exemplar=processor.output_exemplar, ledger=None, _broker=broker) - - def finished_sentinel(self): - """Returns a Ray-awaitable object that will be set when the cache is finished""" - if self._broker is None: - return ray.remote(num_cpus=0)(lambda: None).remote() - else: - return self._broker.finished_sentinel.remote() - - @property - def is_finished(self): - if self._broker is None: - return True - else: - return ray.get(self._broker.is_finished.remote()) - - def __getitem__(self, item): - if isinstance(item, slice): - start, step, stop = self._get_start_stops(item) - # TODO: wait for store to be set - return self.store[start:stop:step] - else: - if item < 0: - item += len(self) - if item < 0 or item >= len(self): - raise IndexError(f"Index {item} out of bounds for cache of size {len(self)}") - return self.store[item] - - def get_batch_sync(self, indices_or_slice, *, timeout: Optional[float] = None): - store = self.store - if isinstance(indices_or_slice, slice): - start, step, stop = self._get_start_stops(indices_or_slice) - indices_or_slice = range(start, stop, step) - - max_index = max(indices_or_slice) - - self._wait_for_len_sync(max_index + 1, timeout=timeout) - - return store.get_batch_sync(indices_or_slice) - - def _get_start_stops(self, slice): - start = slice.start or 0 - if slice.stop is None: - stop = len(self) - elif slice.stop < 0: - stop = len(self) + slice.stop - else: - stop = slice.stop - if start < 0: - start = len(self) + slice.start - step = slice.step or 1 - return start, step, stop - - async def _get_start_stops_async(self, slice): - start = slice.start or 0 - if slice.stop is None: - stop = await self.async_len() - elif slice.stop < 0: - stop = (await self.async_len()) + slice.stop - else: - stop = slice.stop - if start < 0: - start = (await self.async_len()) + slice.start + sharded_cache_writer.finish.remote() - step = slice.step or 1 - return start, step, stop + out = sharded_cache_writer.get_ledger.remote() + return out - def await_finished(self, timeout: Optional[float] = None): - if self._broker is None: - return - x = ray.get(self.finished_sentinel(), timeout=timeout) - self._attempt_to_load_store() - return x - async def finished(self): - if self._broker is None: - return - x = await self.finished_sentinel() - # TODO: make an async version of this - self._attempt_to_load_store() - return x - - def _attempt_to_load_store(self): - if self._store_future.done(): - return - - try: - store = TreeStore.open(self._exemplar, self.cache_dir, mode="r") - except FileNotFoundError: - logger.error(f"Cache at {self.cache_dir} not found.") - assert self._broker is not None - ledger = ray.get(self._broker.current_ledger.remote()) - metrics = _ledger_to_metrics(ledger) - if metrics.rows_finished == 0 and metrics.is_finished: - # this means we built an empty cache. 
go with it - store = TreeStore.open(self._exemplar, f"memory://{self.cache_dir}", mode="a") - else: - raise - try: - self._store_future.set_result(store) - except concurrent.futures.InvalidStateError: - pass - - def attach_metrics_monitor(self, monitor: MetricsMonitor): - if self._broker is None: - logger.warning("Cannot attach metrics monitor to finished cache.") - # TODO: decide what to do about attaching if the cache is already finished - # maybe get the final metrics? - return +def _interleave_shards(readers: Sequence[RayPrefetchQueue], first_index: int) -> Iterator[T]: # _Message + """ + Interleaves the results of multiple iterators. To support resume, + we need to be able to start from not the "first" iterator. - self._metrics_monitors.append(monitor) + Args: + readers: A list of iterators + first_index: The index of the first iterator to start from. We use this to support resuming. + """ - def _monitor_metrics(self): - while not self._stop: - try: + finished: set[int] = set() + total = 0 + while len(finished) < len(readers): + for i in range(first_index, len(readers)): + reader = readers[i] + if i not in finished: try: - ledger = ray.get(self._broker.updated_ledger.remote(), timeout=10.0) - metrics = _ledger_to_metrics(ledger) - for monitor in self._metrics_monitors: - monitor(metrics) - if metrics.is_finished: - break - except TimeoutError: - pass + message = reader.get_next() + total += 1 + yield message + except StopIteration: + finished.add(i) except Exception as e: - if str(e).startswith("Failed to submit task to actor"): - logger.warning("Cache builder actor is gone. Stopping monitoring.") - break - try: - self._attempt_to_load_store() - except FileNotFoundError: - pass - except Exception as e: - if str(e).startswith("Failed to submit task to actor"): - logger.warning("Cache builder actor is gone. Stopping monitoring.") - break - else: - self.logger.exception("Error while reading metrics from shard cache.") + logger.exception(f"Error while processing group {i}") raise e + first_index = 0 -def _ledger_to_metrics(ledger: CacheLedger) -> InProgressCacheMetrics: - return InProgressCacheMetrics( - rows_finished=ledger.total_num_rows, - is_finished=ledger.is_finished, - # shard_rows=ledger.shard_rows, - # finished_shards=ledger.finished_shards, - field_counts=ledger.field_counts, - ) + logger.info(f"Finished all shards, got {total} batches") -class GroupRoundRobinBuffer(Generic[T]): +def _assign_shards_to_groups(shards: Sequence[_ShardStatus], num_groups: int) -> list["_ShardGroup"]: """ - A buffer that holds items from multiple groups and returns them in a round-robin fashion. - The groups need not have the same number of items. If a group is exhausted, it is removed from the rotation. + Assigns shards to groups in a round-robin fashion. 
""" + groups: list[list] = [[] for _ in range(num_groups)] + for i, shard in enumerate(shards): + groups[i % num_groups].append(shard) + return [_ShardGroup(group) for group in groups] - def __init__(self, groups: Sequence[str]): - self.groups = groups - self._current_group = 0 - self.buffers: dict[str, list[tuple[int, T]]] = {group: [] for group in groups} - self._remaining_groups = set(groups) - self._totals_written: dict[str, int] = {group: 0 for group in groups} - self._totals_expected: dict[str, Optional[int]] = {group: None for group in groups} - def __len__(self): - return sum(len(buffer) for buffer in self.buffers.values()) +def _randomize_shards(shards: Sequence[T], seed: int) -> list[T]: + prng = random.Random(seed) + shuffled = list(shards) + prng.shuffle(shuffled) + return shuffled - def append_to_group(self, group: str, item_serial: int, item: T): - if group not in self.groups: - raise ValueError(f"Group {group} not in {self.groups}") - if group not in self._remaining_groups: - raise ValueError(f"Group {group} already finished") +class _ShardGroup: + """ + Given a group of shards and a list of statuses, implicitly concatenates the shards and reads from them. - logger.debug(f"Appending item {item_serial} to {group}") + This class mostly exists for resuming: we want to be able to start from the last shard we were working on. + """ - heapq.heappush(self.buffers[group], (item_serial, item)) + def __init__(self, group: list[_ShardStatus]): + self.shards = group + self.total_rows_committed, _all_finished = self._impute_total_rows_committed_and_check_invariants() + + def _impute_total_rows_committed_and_check_invariants(self): + # we also want to ensure that we haven't started any shards until we've finished the previous ones + total_committed = 0 + last_shard_name = None + last_was_finished = True + all_finished = True + + for status in self.shards: + shard_name = status.shard_name + if not last_was_finished and status.num_rows_committed > 0: + raise ValueError( + f"Shard {shard_name} has rows committed but previous shard in group {last_shard_name} " + "is not finished. Something about the cache configuration has changed: either the " + "number/order of shards, the shard shuffle random seed, or the number of groups." + ) + total_committed += status.num_rows_committed + if not status.is_finished: + all_finished = False + last_was_finished = status.is_finished + last_shard_name = shard_name - def group_total_known(self, group: str, total: int): - if group not in self.groups: - raise ValueError(f"Group {group} not in {self.groups}") + return total_committed, all_finished - if group not in self._remaining_groups: - raise ValueError(f"Group {group} already finished: {total} vs {self._totals_expected[group]}") - self._totals_expected[group] = total +def _make_interleave(name: str, source: ShardedDataSource, initial_ledger: CacheLedger, processor: BatchProcessor): + """ + Given a list of ShardStatus objects and sources, creates an interleaving generator + that reads from shards and tokenizes them in parallel. - if self._totals_written[group] == total: - assert len(self.buffers[group]) == 0 - self._remaining_groups.remove(group) - elif self._totals_written[group] > total: - raise ValueError(f"Group {group} has written more than expected: {self._totals_written[group]} > {total}") + We use ShardStatus objects to track the progress of each shard. If we're preempted, we can resume + from the last shard we were working on. 
This function starts each shard at the last committed row + and starts interleaving from the next shard (i.e. the one with the fewest rows that isn't finished). + """ + logger.setLevel(DEFAULT_LOG_LEVEL) + statuses = _get_shard_statuses(initial_ledger, source) - def is_finished(self): - return len(self._remaining_groups) == 0 + options = initial_ledger.metadata.options - def pop(self) -> Optional[tuple[str, T]]: - group = self._next_group_to_read_from() - if group is None: - return None + unfinished_shards = _check_current_shard_progress(statuses) - if len(self.buffers[group]) == 0: - return None + if not unfinished_shards: + logger.info("All shards finished. Nothing to do.") + return - cur_serial, item = self.buffers[group][0] + group_names, groups = _randomize_and_group_shards(name, options, statuses) - # logger.debug( - # f"group: {group}, cur_serial: {cur_serial}, totals_written: {self._totals_written[group]}," - # f" totals_expected: {self._totals_expected.get(group)}" - # ) + logger.warning(f"Starting cache build with {len(statuses)} shards, in {len(groups)} groups") - if cur_serial > self._totals_written[group]: - return None - elif cur_serial < self._totals_written[group]: - raise ValueError(f"Duplicate serial {cur_serial} for group {group}") + process_task = _mk_process_task(processor) + processor_ref = ray.put(processor) - heapq.heappop(self.buffers[group]) - logger.debug(f"Read item {cur_serial} from {group}") + def _make_generator_fn(group: _ShardGroup): + def generator(): + pylogging.basicConfig(level=DEFAULT_LOG_LEVEL, format=LOG_FORMAT) + for message in _shard_reader_generator(source, group, options.batch_size): + match message: + case _Batch(): + # processed = ray.put(process_task(ray.get(message.payload))) + processed = process_task.remote(processor_ref, message.payload) + yield dataclasses.replace(message, payload=processed) + case _ShardFinished(): + yield message + case _: + raise AssertionError(f"Unexpected message type {type(message)}") - self._totals_written[group] += 1 + return generator - if self._totals_written[group] == self._totals_expected[group]: - assert len(self.buffers[group]) == 0 - assert group in self._remaining_groups - self._remaining_groups.remove(group) + generator_fns = [_make_generator_fn(group) for group in groups] - self._current_group = (self._current_group + 1) % len(self.groups) + readers = [ + RayPrefetchQueue(fn, 128, producer_options=dict(name=name, scheduling_strategy="SPREAD")) + for name, fn in zip(group_names, generator_fns) + ] - return group, item + # then figure out the first shard to start from. This is the first unfinished shard with the minimum number of rows + first_group_to_start = min( + range(len(groups)), + key=lambda i: groups[i].total_rows_committed, + ) - def drain(self) -> Iterator[tuple[str, T]]: - while True: - item = self.pop() - if item is None: - break - yield item + yield from _interleave_shards(readers, first_group_to_start) - def _next_group_to_read_from(self): - """ - Returns the next group to read from. This is always the group with the least that is not finished. - """ - if len(self._remaining_groups) == 0: - return None - # careful: this is only correct if self._current_group is correct. 
whenever we fast forward, we have to - # recompute it - while True: - group = self.groups[self._current_group] - if group not in self._remaining_groups: - assert self._totals_written[group] == self._totals_expected[group] - assert len(self.buffers[group]) == 0 - self._current_group = (self._current_group + 1) % len(self.groups) - else: - break - return group +def _check_current_shard_progress(statuses): + unfinished_shards: list[_ShardStatus] = [] + shards_with_progress: dict[str, int] = {} + for status in statuses: + if not status.is_finished: + unfinished_shards.append(status) + if status.num_rows_committed > 0: + shards_with_progress[status.shard_name] = status.num_rows_committed + if unfinished_shards and shards_with_progress: + formatted = ", ".join(f"{k}: {v}" for k, v in shards_with_progress.items()) + logger.info(f"Resuming from shards with progress: {formatted}") + return unfinished_shards - def fast_forward(self, group, num_rows): - """ - Fast forwards the buffer for a group to a certain number of rows. This sets the "next" item to be the - num_rows-th item. - """ - if group not in self.groups: - raise ValueError(f"Group {group} not in {self.groups}") - if self._totals_written[group] != 0: - raise ValueError(f"Group {group} already written to: {self._totals_written[group]}") +def _randomize_and_group_shards(name, options, statuses): + if options.shard_order_randomization_key is not None: + seed = options.shard_order_randomization_key + logger.info(f"Randomizing shard order with seed {seed}") + statuses = _randomize_shards(statuses, seed) - self._totals_written[group] = num_rows + num_groups = min( + options.num_shard_groups if options.num_shard_groups is not None else len(statuses), len(statuses) + ) + if num_groups == 1: + group_names = [f"generator::{name}::all_shards"] + elif len(statuses) == num_groups: + group_names = [f"generator::{name}::{status.shard_name}" for status in statuses] + else: + group_names = [f"generator::{name}::group_{i}" for i in range(num_groups)] - self._fix_current_group() + groups = _assign_shards_to_groups(statuses, num_groups) + return group_names, groups - def _fix_current_group(self): - # This is always the minimum total written group that is not finished - self._current_group = 0 - min_total = None - for i, group in enumerate(self.groups): - if group not in self._remaining_groups: - continue - total = self._totals_written[group] - if min_total is None or total < min_total: - min_total = total - self._current_group = i +def _shard_reader_generator( + shard_source: ShardedDataSource[T], group: _ShardGroup, batch_size: int +) -> Iterator[_Message]: + """ + Given a group of shards, implicitly concatenates the shards and reads from them. 
+ """ + for status in group.shards: + if status.is_finished: + logger.info(f"Skipping finished shard {status.shard_name}") + continue + start_row = status.num_rows_committed + logger.info(f"Opening shard {status.shard_name} at row {start_row}") + shard_iter = shard_source.open_shard_at_row(status.shard_name, start_row) + + batch = [] + batch_idxes = [] + row_idx = start_row + for row in shard_iter: + batch.append(row) + batch_idxes.append(row_idx) + row_idx += 1 + + if len(batch) == batch_size: + yield _Batch(status.shard_name, batch_idxes, ray.put(batch)) + batch = [] + batch_idxes = [] + + if len(batch) > 0: + yield _Batch(status.shard_name, batch_idxes, ray.put(batch)) + + logger.info(f"Finished generating shard {status.shard_name} with {row_idx} rows") + yield _ShardFinished(status.shard_name, row_idx) + + +def _mk_process_task(processor: BatchProcessor[T, U]) -> RemoteFunction: + """ + Returns a Ray remote function that processes a batch of data. Basically it takes the resources from + the processor and wraps its call + """ + # processor_ref = ray.put(processor) + # exemplar = { + # "input_ids": np.random.randint(0, 100, size=(4096,)) + # } - def next_missing_item_index(self): - """ - Returns the index of the next item that is not in the buffer - (i.e. what's stopping us from yielding the next item). - """ - if len(self._remaining_groups) == 0: - return None + @ray.remote(num_cpus=processor.num_cpus, num_gpus=processor.num_gpus, resources=processor.resources) + def process_task(processor, batch_payload): + try: + result = processor(batch_payload) # TIME: 0.03 seconds + result = _canonicalize_batch(result) # type: ignore + logger.debug("Finished processing batch") + return result + except Exception as e: + logger.exception("Error while processing batch") + raise e + finally: + pass - group = self.groups[self._current_group] - if group not in self._remaining_groups: - self._fix_current_group() - return self.next_missing_item_index() + return process_task - if len(self.buffers[group]) == 0: - return self._totals_written[group] - cur_serial, _ = self.buffers[group][0] +def _canonicalize_batch(batch: Union[dict, List[dict]]) -> List[dict]: + if isinstance(batch, pa.RecordBatch): + batch = dict_from_record_batch(batch) - if cur_serial > self._totals_written[group]: - return self._totals_written[group] - elif cur_serial < self._totals_written[group]: - raise ValueError(f"Duplicate serial {cur_serial} for group {group}") + if isinstance(batch, dict): + return _to_list_of_dicts(batch) + else: + return batch - return None + +def _to_list_of_dicts(batch: dict) -> List[dict]: + """ + Convert a batch of dictionaries to a list of dictionaries, suitable for writing to a cache. 
+ """ + keys = list(batch.keys()) + values = list(batch.values()) + num_rows = len(values[0]) + return [{key: values[i][j] for i, key in enumerate(keys)} for j in range(num_rows)] + + +def _ledger_to_metrics(ledger: CacheLedger) -> InProgressCacheMetrics: + # TODO: remove this + return InProgressCacheMetrics( + rows_finished=ledger.total_num_rows, + is_finished=ledger.is_finished, + # shard_rows=ledger.shard_rows, + shards_finished=len(ledger.finished_shards), + field_counts=ledger.field_counts, + ) -def div_round_up(x, y): - return (x + y - 1) // y +def _get_shard_statuses(ledger: CacheLedger, source: ShardedDataSource): + return [ + _ShardStatus(name, ledger.shard_rows.get(name, 0), name in ledger.finished_shards) + for name in source.shard_names + ] diff --git a/src/levanter/store/jagged_array.py b/src/levanter/store/jagged_array.py index 8b3a26a54..b236641c9 100644 --- a/src/levanter/store/jagged_array.py +++ b/src/levanter/store/jagged_array.py @@ -14,7 +14,7 @@ from levanter.utils.thread_utils import future_from_value -# zarr suggests 1MB chunk size (in bytes, but whatever) +# zarr suggests 1MB chunk size # at 4 bytes this is 256k elements DEFAULT_CHUNK_SIZE = 256 * 1024 DEFAULT_WRITE_CHUNK_SIZE = DEFAULT_CHUNK_SIZE * 512 @@ -38,9 +38,14 @@ class JaggedArrayStore: data: ts.TensorStore shapes: Optional[ts.TensorStore] # (len(offsets), len(data.shape)-1) item_rank: int = 1 + _cache_metadata: bool = False + _cached_num_rows: Optional[int] = None + _cached_data_size: Optional[int] = None @staticmethod - async def open_async(path: Optional[str], *, mode="a", item_rank=1, dtype) -> "JaggedArrayStore": + async def open_async( + path: Optional[str], *, mode="a", item_rank=1, dtype, cache_metadata: bool = False + ) -> "JaggedArrayStore": offset_path = _extend_path(path, "offsets") offsets = _ts_open_async(offset_path, jnp.int64, [1], mode=mode) @@ -53,10 +58,12 @@ async def open_async(path: Optional[str], *, mode="a", item_rank=1, dtype) -> "J else: shapes = None - return JaggedArrayStore(await offsets, await data, await shapes if shapes is not None else None, item_rank) + return JaggedArrayStore( + await offsets, await data, await shapes if shapes is not None else None, item_rank, cache_metadata + ) @staticmethod - def open(path: Optional[str], *, mode="a", item_rank=1, dtype) -> "JaggedArrayStore": + def open(path: Optional[str], *, mode="a", item_rank=1, dtype, cache_metadata: bool = False) -> "JaggedArrayStore": offset_path = _extend_path(path, "offsets") offsets = _ts_open_sync(offset_path, jnp.int64, [1], mode=mode) @@ -69,18 +76,42 @@ def open(path: Optional[str], *, mode="a", item_rank=1, dtype) -> "JaggedArraySt else: shapes = None - return JaggedArrayStore(offsets, data, shapes, item_rank) + return JaggedArrayStore(offsets, data, shapes, item_rank, cache_metadata) @property def num_rows(self): - return int(self.offsets[0].read().result()) + if self._cached_num_rows is not None: + return self._cached_num_rows + result = int(self.offsets[0].read().result()) + if self._cache_metadata: + self._cached_num_rows = result + return result async def num_rows_async(self): - return int(await self.offsets[0].read()) + if self._cached_num_rows is not None: + return self._cached_num_rows + result = int(await self.offsets[0].read()) + if self._cache_metadata: + self._cached_num_rows = result + return result @property def data_size(self): - return int(self.offsets[self.num_rows].read().result()) + # return int(self.offsets[self.num_rows].read().result()) + if self._cached_data_size is not None: + 
return self._cached_data_size + result = int(self.offsets[self.num_rows].read().result()) + if self._cache_metadata: + self._cached_data_size = result + return result + + async def data_size_async(self): + if self._cached_data_size is not None: + return self._cached_data_size + result = int(await self.offsets[self.num_rows].read()) + if self._cache_metadata: + self._cached_data_size = result + return result async def append_async(self, data: jax.Array): await self.extend_async([data]) @@ -122,6 +153,10 @@ async def trim_to_size_async(self, size: int): await data_fut await offsets_fut + if self._cache_metadata: + self._cached_num_rows = size + self._cached_data_size = new_max + def trim_to_size(self, size: int): if size >= self.num_rows: return @@ -151,6 +186,10 @@ def trim_to_size(self, size: int): if shape_fut is not None: shape_fut.result() + if self._cache_metadata: + self._cached_num_rows = size + self._cached_data_size = new_max + async def extend_async(self, arrays: Sequence[jax.Array]): data, new_offsets, shapes = self._prepare_batch(arrays) @@ -165,13 +204,14 @@ async def extend_async(self, arrays: Sequence[jax.Array]): ] if self.shapes is not None: write_tasks.append(self.shapes[num_rows : num_rows + num_added].write(shapes)) - await asyncio.gather(*write_tasks) # Update num_rows - int(self.offsets[self.num_rows].read().result()) await self.offsets[0].write(num_rows + len(arrays)) - # print("done") + + if self._cache_metadata: + self._cached_num_rows = num_rows + len(arrays) + self._cached_data_size = current_data_size + len(data) def extend(self, arrays: Sequence[jax.Array]): data, new_offsets, shapes = self._prepare_batch(arrays) @@ -187,12 +227,16 @@ def extend(self, arrays: Sequence[jax.Array]): if self.shapes is not None: write_tasks.append(self.shapes[num_rows : num_rows + num_added].write(shapes)) + # Update num_rows. We want to make sure this comes after the other data is committed to avoid a race for task in write_tasks: task.result() - # Update num_rows. We want to make sure this comes after the other data is committed to avoid a race self.offsets[0].write(num_rows + len(arrays)).result() + if self._cache_metadata: + self._cached_num_rows = num_rows + len(arrays) + self._cached_data_size = current_data_size + len(data) + def _prepare_batch(self, arrays): if self.shapes is not None: for data in arrays: diff --git a/src/levanter/store/tree_store.py b/src/levanter/store/tree_store.py index 0b1e93bff..cd29e5a4c 100644 --- a/src/levanter/store/tree_store.py +++ b/src/levanter/store/tree_store.py @@ -1,6 +1,6 @@ import asyncio import os -from typing import Generic, List, TypeVar +from typing import Generic, List, Sequence, TypeVar import jax import jax.numpy as jnp @@ -50,17 +50,17 @@ def __init__(self, tree, path: str, mode: str): self.tree = tree @staticmethod - def open(exemplar: T, path: str, *, mode="a") -> "TreeStore": + def open(exemplar: T, path: str, *, mode="a", cache_metadata: bool = False) -> "TreeStore": """ Open a TreeStoreBuilder from a file. """ - tree = _construct_builder_tree(exemplar, path, mode) + tree = _construct_builder_tree(exemplar, path, mode, cache_metadata) return TreeStore(tree, path, mode) def append(self, ex: T): return self.extend([ex]) - def extend(self, batch: List[T]): + def extend(self, batch: Sequence[T]): """ Append a batch of data to the store. 
""" @@ -168,12 +168,18 @@ def get_batch_sync(self, indices) -> List[T]: return out -def _construct_builder_tree(exemplar, path, mode): +def _construct_builder_tree(exemplar, path, mode, cache_metadata): def open_builder(tree_path, item): item = np.asarray(item) rank = item.ndim render_tree_path = "/".join(_render_path_elem(x) for x in tree_path) - return JaggedArrayStore.open(os.path.join(path, render_tree_path), mode=mode, item_rank=rank, dtype=item.dtype) + return JaggedArrayStore.open( + os.path.join(path, render_tree_path), + mode=mode, + item_rank=rank, + dtype=item.dtype, + cache_metadata=cache_metadata, + ) return jtu.tree_map_with_path(open_builder, exemplar, is_leaf=heuristic_is_leaf) diff --git a/src/levanter/utils/py_utils.py b/src/levanter/utils/py_utils.py index a796dd6af..8431e1c3a 100644 --- a/src/levanter/utils/py_utils.py +++ b/src/levanter/utils/py_utils.py @@ -1,5 +1,6 @@ import os import sys +import time from dataclasses import dataclass from typing import Callable, TypeVar @@ -181,3 +182,37 @@ def actual_sizeof(obj): need_to_see.extend(obj) objects = need_to_see return size + + +class Stopwatch: + """Resumable stop watch for tracking time per call""" + + def __init__(self): + self._start_time = time.time() + self._elapsed = 0.0 + self._n = 0 + + def start(self): + self._start_time = time.time() + self._n += 1 + + def stop(self): + self._elapsed += time.time() - self._start_time + + def reset(self): + self._elapsed = 0.0 + + def elapsed(self): + return self._elapsed + + def average(self): + if self._n == 0: + return 0.0 + return self._elapsed / self._n + + def __enter__(self): + self.start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.stop() diff --git a/src/levanter/utils/ray_utils.py b/src/levanter/utils/ray_utils.py index 8a299720e..40c76b614 100644 --- a/src/levanter/utils/ray_utils.py +++ b/src/levanter/utils/ray_utils.py @@ -1,6 +1,7 @@ import contextlib import dataclasses import logging +import logging as pylogging import sys from dataclasses import dataclass from typing import Optional @@ -52,6 +53,9 @@ class RefBox: ref: ray.ObjectRef + def get(self): + return ray.get(self.ref) + class DoneSentinel: pass @@ -78,7 +82,7 @@ def current_actor_handle() -> ray.actor.ActorHandle: class SnitchRecipient: logger: logging.Logger - def _child_failed(self, child: ray.actor.ActorHandle, exception: ExceptionInfo): + def _child_failed(self, child: ray.actor.ActorHandle | str | None, exception: ExceptionInfo): info = exception.restore() self.logger.error(f"Child {child} failed with exception {info[1]}", exc_info=info) exception.reraise() @@ -90,6 +94,40 @@ def log_failures_to(parent, suppress=False): try: yield except Exception as e: - parent._child_failed.remote(current_actor_handle(), ser_exc_info(e)) + try: + handle = current_actor_handle() + except RuntimeError: + handle = ray.runtime_context.get_runtime_context().get_task_id() + + parent._child_failed.remote(handle, ser_exc_info(e)) if not suppress: raise e + + +DEFAULT_LOG_LEVEL = logging.INFO +LOG_FORMAT = "%(asctime)s %(levelname)s: %(message)s" + + +@ray.remote +class StopwatchActor: + def __init__(self): + pylogging.basicConfig(level=DEFAULT_LOG_LEVEL, format=LOG_FORMAT) + self._logger = pylogging.getLogger("StopwatchActor") + self._times_per = {} + self._counts_per = {} + self._total = 0 + + def measure(self, name: str, time: float): + self._times_per[name] = self._times_per.get(name, 0) + time + self._counts_per[name] = self._counts_per.get(name, 0) + 1 + self._total += 1 + + if 
self._total % 1000 == 0: + for name, time in self._times_per.items(): + self._logger.info(f"{name}: {time / self._counts_per[name]}") + + def get(self, name: str): + return self._times_per.get(name, 0), self._counts_per.get(name, 0) + + def average(self, name: str): + return self._times_per.get(name, 0) / self._counts_per.get(name, 1) diff --git a/tests/test_audio.py b/tests/test_audio.py index 8d3015431..3ad9b09b3 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -80,3 +80,13 @@ def test_hf_audio_serial_cache(): assert ex["input_features"].shape == (80, 3000), ex["input_features"].shape assert ex["input_ids"].shape == (1024,), ex["input_ids"].shape assert ex["attention_mask"].shape == (1024,), ex["attention_mask"].shape + + +@skip_if_no_soundlibs +@skip_if_hf_model_not_accessible("openai/whisper-tiny") +def test_metadata_works(): + processor = AutoProcessor.from_pretrained("openai/whisper-tiny") + tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny") + batch_processor = BatchAudioProcessor(processor, tokenizer) + # test this doesn't throw + assert len(batch_processor.metadata) diff --git a/tests/test_jagged_array.py b/tests/test_jagged_array.py index 24ed24b08..c89a2c625 100644 --- a/tests/test_jagged_array.py +++ b/tests/test_jagged_array.py @@ -10,9 +10,10 @@ class TestJaggedArrayStore: - def test_append_and_get(self): + @pytest.mark.parametrize("cache_metadata", [True, False]) + def test_append_and_get(self, cache_metadata): with tempfile.TemporaryDirectory() as tmpdir: - builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32) + builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32, cache_metadata=cache_metadata) data1 = jnp.array([[1.0, 2.0], [3.0, 4.0]]) data2 = jnp.array([[5.0]]) @@ -31,9 +32,10 @@ def test_append_and_get(self): # result_slice = builder[0:2] # assert isinstance(result_slice, JaggedArray) - def test_extend_with_multiple(self): + @pytest.mark.parametrize("cache_metadata", [True, False]) + def test_extend_with_multiple(self, cache_metadata): with tempfile.TemporaryDirectory() as tmpdir: - builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32) + builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32, cache_metadata=cache_metadata) data1 = jnp.array([[1.0, 2.0], [3.0, 4.0]]) data2 = jnp.array([[5.0]]) @@ -54,9 +56,10 @@ def test_append_error(self): with pytest.raises(ValueError): builder.append(jnp.array([[1.0, 2.0]])) - def test_append_single_rank(self): + @pytest.mark.parametrize("cache_metadata", [True, False]) + def test_append_single_rank(self, cache_metadata): with tempfile.TemporaryDirectory() as tmpdir: - builder = JaggedArrayStore.open(tmpdir, item_rank=1, dtype=jnp.float32) + builder = JaggedArrayStore.open(tmpdir, item_rank=1, dtype=jnp.float32, cache_metadata=cache_metadata) data = jnp.array([1.0, 2.0, 3.0]) builder.append(data) @@ -66,9 +69,10 @@ def test_append_single_rank(self): result = builder[0] assert jnp.all(result == data) - def test_append_multi_rank(self): + @pytest.mark.parametrize("cache_metadata", [True, False]) + def test_append_multi_rank(self, cache_metadata): with tempfile.TemporaryDirectory() as tmpdir: - builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32) + builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32, cache_metadata=cache_metadata) data1 = jnp.array([[1.0, 2.0], [3.0, 4.0]]) data2 = jnp.array([[5.0, 6.0], [7.0, 8.0]]) @@ -105,14 +109,18 @@ def test_step_slicing(self): # builder[::2] -async def 
create_builder_with_data(directory, num_sequences: int, sequence_length: int | tuple[int, ...]): +async def create_builder_with_data( + directory, num_sequences: int, sequence_length: int | tuple[int, ...], cache_metadata: bool = True +) -> JaggedArrayStore: if isinstance(sequence_length, int): sequence_length = (sequence_length,) """Helper function to create a JaggedArrayStore with specific data.""" seed = jax.random.PRNGKey(num_sequences * math.prod(sequence_length)) - builder = await JaggedArrayStore.open_async(directory, item_rank=len(sequence_length), dtype=jnp.int64) + builder = await JaggedArrayStore.open_async( + directory, item_rank=len(sequence_length), dtype=jnp.int64, cache_metadata=cache_metadata + ) for i in range(num_sequences): key, seed = jax.random.split(seed) data = jax.random.randint(key, sequence_length, 0, 100) @@ -122,7 +130,7 @@ async def create_builder_with_data(directory, num_sequences: int, sequence_lengt def create_builder_with_data_sync( - directory, num_sequences: int, sequence_length: int | tuple[int, ...] + directory, num_sequences: int, sequence_length: int | tuple[int, ...], cache_metadata: bool = True ) -> JaggedArrayStore: if isinstance(sequence_length, int): sequence_length = (sequence_length,) @@ -130,7 +138,9 @@ def create_builder_with_data_sync( """Helper function to create a JaggedArrayStore with specific data.""" seed = jax.random.PRNGKey(num_sequences * math.prod(sequence_length)) - builder = JaggedArrayStore.open(directory, item_rank=len(sequence_length), dtype=jnp.int64) + builder = JaggedArrayStore.open( + directory, item_rank=len(sequence_length), dtype=jnp.int64, cache_metadata=cache_metadata + ) for i in range(num_sequences): key, seed = jax.random.split(seed) data = jax.random.randint(key, sequence_length, 0, 100) @@ -190,9 +200,12 @@ async def test_trim_to_size_larger_than_current(): @pytest.mark.asyncio -async def test_trim_to_size_with_shapes_async(): +@pytest.mark.parametrize("cache_metadata", [True, False]) +async def test_trim_to_size_with_shapes_async(cache_metadata): tmpdir = tempfile.TemporaryDirectory().name - builder = await create_builder_with_data(tmpdir, num_sequences=10, sequence_length=(10, 100)) + builder = await create_builder_with_data( + tmpdir, num_sequences=10, sequence_length=(10, 100), cache_metadata=cache_metadata + ) expected_shapes = list(await builder.shapes[0:10].read()) # Trim to smaller size @@ -205,9 +218,12 @@ async def test_trim_to_size_with_shapes_async(): assert np.array_equal(trimmed_shapes, jnp.stack(expected_shapes[:5])) -def test_trim_to_size(): +@pytest.mark.parametrize("cache_metadata", [True, False]) +def test_trim_to_size_sync(cache_metadata): tmpdir = tempfile.TemporaryDirectory().name - builder = create_builder_with_data_sync(tmpdir, num_sequences=10, sequence_length=1000) + builder = create_builder_with_data_sync( + tmpdir, num_sequences=10, sequence_length=1000, cache_metadata=cache_metadata + ) # Initial size initial_size = len(builder) diff --git a/tests/test_new_cache.py b/tests/test_new_cache.py index b6132e548..af6fa885f 100644 --- a/tests/test_new_cache.py +++ b/tests/test_new_cache.py @@ -1,42 +1,43 @@ import asyncio +import copy import logging +import os import tempfile -from typing import Iterator, Sequence -from unittest.mock import MagicMock +from typing import Any, Dict, Iterator, Sequence import numpy as np import pytest import ray -from ray.exceptions import RayTaskError from levanter.data import BatchProcessor, ShardedDataSource, batched from levanter.data.sharded_datasource 
import TextUrlDataSource from levanter.store.cache import ( + LEDGER_FILE_NAME, + CacheLedger, + CacheOptions, SerialCacheWriter, + ShardedCacheWriter, TreeStore, _get_builder_actor, - _OrderedCacheWriter, + _serialize_json_and_commit, build_or_load_cache, ) from levanter.utils.py_utils import logical_cpu_core_count -from levanter.utils.ray_utils import ExceptionInfo, SnitchRecipient, ser_exc_info +from levanter.utils.ray_utils import ExceptionInfo, SnitchRecipient class TestProcessor(BatchProcessor[Sequence[int], dict[str, np.ndarray]]): - def __init__(self, batch_size: int = 8): - self._batch_size = batch_size - def __call__(self, batch: Sequence[Sequence[int]]) -> Sequence[dict[str, np.ndarray]]: # return pa.RecordBatch.from_arrays([pa.array(batch)], ["test"]) return [{"test": np.asarray(x)} for x in batch] @property - def output_exemplar(self): - return {"test": np.array([0], dtype=np.int64)} + def metadata(self) -> Dict[str, Any]: + return {} @property - def batch_size(self) -> int: - return self._batch_size + def output_exemplar(self): + return {"test": np.array([0], dtype=np.int64)} @property def num_cpus(self) -> int: @@ -52,8 +53,7 @@ def simple_process(processor, source): return result -def process_interleave(processor, source): - batch_size = processor.batch_size +def process_interleave(processor, source, batch_size): shard_iterators = { shard_name: batched(iter(source.open_shard(shard_name)), batch_size) for shard_name in source.shard_names } @@ -82,16 +82,9 @@ def teardown_module(module): class SimpleProcessor(BatchProcessor[Sequence[int], dict[str, np.ndarray]]): - def __init__(self, batch_size: int = 8): - self._batch_size = batch_size - def __call__(self, batch: Sequence[Sequence[int]]) -> Sequence[dict[str, Sequence[int]]]: return [{"data": x} for x in batch] - @property - def batch_size(self) -> int: - return self._batch_size - @property def num_cpus(self) -> int: return 1 @@ -100,6 +93,10 @@ def num_cpus(self) -> int: def output_exemplar(self) -> dict[str, np.ndarray]: return {"data": np.array([0], dtype=np.int64)} + @property + def metadata(self) -> Dict[str, Any]: + return {} + class SimpleShardSource(ShardedDataSource[list[int]]): def __init__(self, num_shards: int = 4): @@ -124,7 +121,7 @@ def test_serial_cache_writer(): with SerialCacheWriter(tmpdir1, exemplar) as writer: for shard_name in source.shard_names: - for ex in batched(source.open_shard(shard_name), processor.batch_size): + for ex in batched(source.open_shard(shard_name), 32): writer.write_batch(processor(ex)) _ = writer.result() @@ -181,7 +178,7 @@ def shard_finished(self, shard_name): def get_finished_shards(self): return self._finished_shards - def _updated_ledger(self, ledger): + def _notify_updated_ledger(self, ledger): if ledger.is_finished: self._finished = True @@ -193,421 +190,56 @@ def _finalize(self): def is_finished(self): return self._finished - def signal_backpressure(self, desired_next_item: float): - self._desired_next_item = desired_next_item - - def desired_next_item(self): - return self._desired_next_item - - -@pytest.mark.asyncio -async def test_batch_finished(): - parent = PretendParent.remote() - exemplar = np.array([1, 2, 3]) - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1", "shard2", "shard3"] - writer = _OrderedCacheWriter.remote( - parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=1 - ) - - try: - shard_idx = "shard1" - shard_batch_idx = 0 - batch_result = [np.array([1, 2, 3])] - - await 
writer.batch_finished.remote(shard_idx, shard_batch_idx, batch_result) - await writer.flush.remote() - shard_status = await writer.get_shard_status.remote("shard1") - assert shard_status.num_rows_committed == 1 - finally: - ray.kill(parent) - ray.kill(writer) - - -@pytest.mark.asyncio -async def test_shard_finished_reading(): - parent = PretendParent.remote() - exemplar = MagicMock() - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1", "shard2", "shard3"] - writer = _OrderedCacheWriter.remote(parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards) - - try: - shard_name = "shard1" - expected_batches = 5 - - await writer.shard_finished_reading.remote(shard_name, expected_batches) - shard_status = await writer.get_shard_status.remote(shard_name) - assert shard_status.is_finished is False - finally: - ray.kill(parent) - ray.kill(writer) - - -@pytest.mark.asyncio -async def test_get_shard_status(): - parent = PretendParent.remote() - exemplar = np.array([1, 2, 3]) - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1", "shard2", "shard3"] - writer = _OrderedCacheWriter.remote(parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards) - - try: - shard_name = "shard1" - shard_status = await writer.get_shard_status.remote(shard_name) - - assert shard_status.shard_name == shard_name - assert shard_status.num_rows_committed == 0 - assert not shard_status.is_finished - finally: - ray.kill(parent) - ray.kill(writer) - - -@pytest.mark.asyncio -async def test_shard_failed(): - parent = PretendParent.remote() - exemplar = MagicMock() - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1", "shard2", "shard3"] - writer = _OrderedCacheWriter.remote(parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards) - - try: - shard_name = "shard1" - batch_id = 0 - try: - raise Exception("Test Exception") - except: # noqa - exc_info = ser_exc_info() - - await writer.shard_failed.remote(shard_name, batch_id, exc_info) - exception_received = await parent.wait_for_failure.remote() - assert str(exception_received.ex) == str(exc_info.ex) - finally: - ray.kill(parent) - ray.kill(writer) - - -DEFAULT_BATCH_SIZE = 128 - - -@pytest.mark.asyncio -async def test_attempt_to_write_batches(): - parent = PretendParent.remote() - exemplar = np.array([1, 2, 3]) - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1", "shard2", "shard3"] - writer = _OrderedCacheWriter.remote( - parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=2 - ) - - try: - shard1_batch = [np.asarray([1, 2, 3])] - shard2_batch = [np.asarray([4, 5, 6, 7])] - - await writer.batch_finished.remote("shard1", 0, shard1_batch) - await writer.batch_finished.remote("shard2", 0, shard2_batch) - - await writer.flush.remote() - - ledger = await writer.get_ledger.remote() - assert ledger.is_finished is False - assert ledger.total_num_rows == 2 # Assuming each batch has 1 row for simplicity - - store = TreeStore.open(exemplar, cache_dir, mode="r") - assert len(store) == 2 - np.testing.assert_array_equal(store[0], shard1_batch[0]) - np.testing.assert_array_equal(store[1], shard2_batch[0]) - finally: - ray.kill(parent) - ray.kill(writer) - - -@pytest.mark.asyncio -async def test_finalize_cache(): - parent = PretendParent.remote() - exemplar = np.array([1, 2, 3]) - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1", "shard2", "shard3"] - writer = _OrderedCacheWriter.remote(parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards) - - 
try: - shard1_batch = [np.array([1, 2, 3])] - shard2_batch = [np.array([4, 5, 6, 7])] - - await writer.batch_finished.remote("shard1", 0, shard1_batch) - await writer.shard_finished_reading.remote("shard1", 1) - await writer.shard_finished_reading.remote("shard2", 1) - await writer.batch_finished.remote("shard2", 0, shard2_batch) - await writer.flush.remote() - - ledger = await writer.get_ledger.remote() - assert ledger.is_finished is False - assert ledger.total_num_rows == 2 # Assuming each batch has 1 row for simplicity - - await writer.shard_finished_reading.remote("shard3", 0) - finished_shards = await parent.get_finished_shards.remote() - assert len(finished_shards) == 3 - - ledger = await writer.get_ledger.remote() - assert ledger.is_finished is True - assert ledger.total_num_rows == 2 - assert await parent.is_finished.remote() is True - finally: - ray.kill(parent) - ray.kill(writer) - - -@pytest.mark.asyncio -async def test_error_handling(): - parent = PretendParent.remote() - exemplar = np.array([1, 2, 3]) - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1", "shard2", "shard3"] - writer = _OrderedCacheWriter.remote(parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards) - - try: - with pytest.raises(TypeError): - await writer.batch_finished.remote("shard1", 0, None) - - exception_received = await parent.wait_for_failure.remote() - assert exception_received is not None - finally: - ray.kill(parent) - ray.kill(writer) - - -@pytest.mark.asyncio -async def test_out_of_order_batches_same_shard(): - parent = PretendParent.remote() - exemplar = np.array([1, 2, 3]) - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1"] - writer = _OrderedCacheWriter.remote( - parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=2 - ) - - try: - # Sending batch 1 before batch 0 for shard1 - shard1_batch0 = [np.array([1, 2, 3])] - shard1_batch1 = [np.array([4, 5, 6])] - - await writer.batch_finished.remote("shard1", 1, shard1_batch1) - await writer.batch_finished.remote("shard1", 0, shard1_batch0) - await writer.flush.remote() - - store = TreeStore.open(exemplar, cache_dir, mode="r") - assert len(store) == 2 - np.testing.assert_array_equal(store[0], shard1_batch0[0]) - np.testing.assert_array_equal(store[1], shard1_batch1[0]) - finally: - ray.kill(parent) - ray.kill(writer) - - -@pytest.mark.asyncio -async def test_out_of_order_batches_different_shards(): - parent = PretendParent.remote() - exemplar = np.array([1, 2, 3]) - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1", "shard2"] - writer = _OrderedCacheWriter.remote( - parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=3 - ) - - try: - # Sending batches out of order across different shards - shard1_batch0 = [np.array([1, 2, 3])] - shard2_batch0 = [np.array([4, 5, 6])] - shard1_batch1 = [np.array([7, 8, 9])] - - await writer.batch_finished.remote("shard1", 1, shard1_batch1) - await writer.batch_finished.remote("shard2", 0, shard2_batch0) - await writer.batch_finished.remote("shard1", 0, shard1_batch0) - await writer.flush.remote() - - store = TreeStore.open(exemplar, cache_dir, mode="r") - assert len(store) == 3 - np.testing.assert_array_equal(store[0], shard1_batch0[0]) - np.testing.assert_array_equal(store[1], shard2_batch0[0]) - np.testing.assert_array_equal(store[2], shard1_batch1[0]) - finally: - ray.kill(parent) - ray.kill(writer) - - -@pytest.mark.asyncio -async def test_batches_different_orders_all_shards(): - parent = 
PretendParent.remote() - exemplar = np.array([1, 2, 3]) - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1", "shard2", "shard3"] - writer = _OrderedCacheWriter.remote( - parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=2 - ) - - try: - # Sending batches in different orders across all shards - shard1_batch0 = [np.array([1, 2, 3])] - shard1_batch1 = [np.array([4, 5, 6])] - shard2_batch0 = [np.array([7, 8, 9])] - shard3_batch0 = [np.array([10, 11, 12])] - - await writer.batch_finished.remote("shard2", 0, shard2_batch0) - await writer.batch_finished.remote("shard3", 0, shard3_batch0) - await writer.batch_finished.remote("shard1", 1, shard1_batch1) - await writer.batch_finished.remote("shard1", 0, shard1_batch0) - await writer.flush.remote() - - store = TreeStore.open(exemplar, cache_dir, mode="r") - assert len(store) == 4 - np.testing.assert_array_equal(store[0], shard1_batch0[0]) - np.testing.assert_array_equal(store[1], shard2_batch0[0]) - np.testing.assert_array_equal(store[2], shard3_batch0[0]) - np.testing.assert_array_equal(store[3], shard1_batch1[0]) - finally: - ray.kill(parent) - ray.kill(writer) - -@pytest.mark.asyncio -async def test_intermixed_batches_same_and_different_shards(): - parent = PretendParent.remote() - exemplar = np.array([1, 2, 3]) - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1", "shard2", "shard3"] - writer = _OrderedCacheWriter.remote( - parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=1 +@pytest.mark.ray +def test_full_end_to_end_cache(): + td = tempfile.TemporaryDirectory() + with td as tmpdir: + ray_ds = build_or_load_cache( + tmpdir, + SimpleShardSource(num_shards=2), + TestProcessor(), + await_finished=True, + options=CacheOptions.no_fanciness(8), ) - try: - # Sending intermixed batches from the same and different shards - shard1_batch0 = [np.array([1, 2, 3])] - shard2_batch0 = [np.array([4, 5, 6])] - shard1_batch1 = [np.array([7, 8, 9])] - shard3_batch0 = [np.array([10, 11, 12])] - shard2_batch1 = [np.array([13, 14, 15])] - - await writer.batch_finished.remote("shard2", 0, shard2_batch0) - await writer.batch_finished.remote("shard3", 0, shard3_batch0) - await writer.batch_finished.remote("shard1", 1, shard1_batch1) - await writer.batch_finished.remote("shard2", 1, shard2_batch1) - await writer.batch_finished.remote("shard1", 0, shard1_batch0) - await writer.flush.remote() - - store = TreeStore.open(exemplar, cache_dir, mode="r") - assert len(store) == 5 - np.testing.assert_array_equal(store[0], shard1_batch0[0]) - np.testing.assert_array_equal(store[1], shard2_batch0[0]) - np.testing.assert_array_equal(store[2], shard3_batch0[0]) - np.testing.assert_array_equal(store[3], shard1_batch1[0]) - np.testing.assert_array_equal(store[4], shard2_batch1[0]) - finally: - ray.kill(parent) - ray.kill(writer) - + expected = process_interleave(TestProcessor(), SimpleShardSource(num_shards=2), 8) -@pytest.mark.asyncio -async def test_duplicate_batches_same_shard(): - parent = PretendParent.remote() - exemplar = np.array([1, 2, 3]) - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1"] - writer = _OrderedCacheWriter.remote(parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards) - - try: - # Sending duplicate batches for the same shard - shard1_batch0 = [np.array([1, 2, 3])] - - await writer.batch_finished.remote("shard1", 0, shard1_batch0) - await writer.flush.remote() - with pytest.raises(RayTaskError): - await 
writer.batch_finished.remote("shard1", 0, shard1_batch0) # Duplicate - finally: - ray.kill(parent) - ray.kill(writer) - - -@pytest.mark.asyncio -async def test_mixed_order_batches_multiple_shards(): - parent = PretendParent.remote() - exemplar = np.array([1, 2, 3]) - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1", "shard2", "shard3"] - writer = _OrderedCacheWriter.remote( - parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=1 - ) + all_data = ray_ds[:] - try: - # Sending batches in mixed order for multiple shards - shard1_batch0 = [np.array([1, 2, 3])] - shard2_batch0 = [np.array([4, 5, 6])] - shard1_batch1 = [np.array([7, 8, 9])] - shard2_batch1 = [np.array([10, 11, 12])] - shard3_batch0 = [np.array([13, 14, 15])] - shard3_batch1 = [np.array([16, 17, 18])] - - await writer.batch_finished.remote("shard3", 0, shard3_batch0) - await writer.batch_finished.remote("shard1", 1, shard1_batch1) - await writer.batch_finished.remote("shard2", 0, shard2_batch0) - await writer.batch_finished.remote("shard2", 1, shard2_batch1) - await writer.batch_finished.remote("shard1", 0, shard1_batch0) - await writer.batch_finished.remote("shard3", 1, shard3_batch1) - await writer.flush.remote() - - store = TreeStore.open(exemplar, cache_dir, mode="r") - assert len(store) == 6 - np.testing.assert_array_equal(store[0], shard1_batch0[0]) - np.testing.assert_array_equal(store[1], shard2_batch0[0]) - np.testing.assert_array_equal(store[2], shard3_batch0[0]) - np.testing.assert_array_equal(store[3], shard1_batch1[0]) - np.testing.assert_array_equal(store[4], shard2_batch1[0]) - np.testing.assert_array_equal(store[5], shard3_batch1[0]) - finally: - ray.kill(parent) - ray.kill(writer) + check_datasets_equal(all_data, expected) @pytest.mark.ray -def test_full_end_to_end_cache_simple(): +def test_full_end_to_end_cache_with_groups(): td = tempfile.TemporaryDirectory() with td as tmpdir: ray_ds = build_or_load_cache( tmpdir, - SimpleShardSource(num_shards=1), + SimpleShardSource(num_shards=5), TestProcessor(), await_finished=True, + options=CacheOptions(num_shard_groups=2, batch_size=8, shard_order_randomization_key=None), ) - simple_processed = simple_process(TestProcessor(), SimpleShardSource()) + expected = process_interleave(TestProcessor(), SimpleShardSource(num_shards=5), 8) all_data = ray_ds[:] - check_datasets_equal(all_data, simple_processed) + # check_datasets_equal(all_data, expected) + assert len(all_data) == len(list(expected)) @pytest.mark.ray def test_cache_remembers_its_cached(): directory = tempfile.TemporaryDirectory() with directory as tmpdir: - ds1 = build_or_load_cache(tmpdir, SimpleShardSource(), TestProcessor()) + ds1 = build_or_load_cache(tmpdir, SimpleShardSource(), TestProcessor(), await_finished=True) - class ThrowingProcessor(BatchProcessor[Sequence[int], dict[str, np.ndarray]]): + class ThrowingProcessor(TestProcessor): def __call__(self, batch: Sequence[Sequence[int]]): raise RuntimeError("This should not be called") - @property - def output_exemplar(self) -> dict[str, np.ndarray]: - return {"test": np.array([0], dtype=np.int64)} - - @property - def batch_size(self) -> int: - return 8 - - @property - def num_cpus(self) -> int: - return 1 - # testing this doesn't throw ds2 = build_or_load_cache(tmpdir, SimpleShardSource(), ThrowingProcessor(), await_finished=True) @@ -615,6 +247,9 @@ def num_cpus(self) -> int: def check_datasets_equal(ds1, ds2): + ds1 = list(ds1) + ds2 = list(ds2) + assert len(ds1) == len(ds2) for r1, r2 in zip(ds1, ds2): 
assert r1.keys() == r2.keys() for key in r1.keys(): @@ -672,7 +307,6 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[list[int]]: # compare to the original with no crash reader2 = build_or_load_cache(tmpdir2, SimpleShardSource(), TestProcessor(), await_finished=True) - assert len(list(reader1)) == 40 check_datasets_equal(reader1, reader2) @@ -699,23 +333,26 @@ def shard_names(self) -> Sequence[str]: return ["shard_0", "shard_1"] def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[list[int]]: + assert shard_name in self.shard_names max_count = 40 if shard_name == "shard_1" else 20 shard_id = int(shard_name.split("_")[1]) for i in range(0, max_count): yield [i * 10 + shard_id] * 10 with tempfile.TemporaryDirectory() as tmpdir: + processor = TestProcessor() cache = build_or_load_cache( tmpdir, SlowShardSource(), - TestProcessor(1), + processor, await_finished=False, + options=CacheOptions.no_fanciness(16), ) # now block until the cache is done - cache.await_finished(timeout=10) + cache.await_finished(timeout=30) - expected = process_interleave(TestProcessor(1), SlowShardSource()) + expected = process_interleave(processor, SlowShardSource(), 16) check_datasets_equal(list(cache[:]), expected) @@ -750,8 +387,13 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[list[int]]: with tempfile.TemporaryDirectory() as tmpdir: cache = build_or_load_cache( - tmpdir, SlowShardSource(), TestProcessor(5), await_finished=False, items_per_write=5 - ) + tmpdir, + SlowShardSource(), + TestProcessor(), + await_finished=False, + force_flush=True, + options=CacheOptions.no_fanciness(5), + ) # we need force_flush to ensure the cache is written to disk # read the first 10 elements # ensure the first 10 elements are [{"test": np.array([i] * 10)} for i in range(10)] @@ -782,22 +424,10 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[list[int]]: @pytest.mark.skip("This test segfaults in CI. 
I think a ray bug") @pytest.mark.ray def test_shard_cache_crashes_if_processor_throws(): - class ThrowingProcessor(BatchProcessor[Sequence[int], dict[str, np.ndarray]]): + class ThrowingProcessor(SimpleProcessor): def __call__(self, batch: Sequence[Sequence[int]]): raise RuntimeError("exc") - @property - def output_exemplar(self) -> dict: - return {"test": np.array([0], dtype=np.int64)} - - @property - def batch_size(self) -> int: - return 8 - - @property - def num_cpus(self) -> int: - return 1 - with tempfile.TemporaryDirectory() as tmpdir: with pytest.raises(RuntimeError): build_or_load_cache(tmpdir, SimpleShardSource(), ThrowingProcessor(), await_finished=True) @@ -880,60 +510,81 @@ def test_shard_cache_fails_gracefully_with_unknown_file_type(): del cache -@pytest.mark.ray -@pytest.mark.asyncio -async def test_backpressure_mechanism(): - parent = PretendParent.remote() - exemplar = np.array([1, 2, 3]) - with tempfile.TemporaryDirectory() as cache_dir: - shards = ["shard1", "shard2", "shard3"] - writer = _OrderedCacheWriter.remote( - parent, "test", exemplar, DEFAULT_BATCH_SIZE, cache_dir, shards, min_items_to_write=1 - ) +def test_sharded_cache_writer(): + with tempfile.TemporaryDirectory() as tmpdir: + source = SimpleShardSource(num_shards=4) + processor = SimpleProcessor() + ledger = CacheLedger.load_or_initialize(tmpdir, source, processor, CacheOptions.no_fanciness(8)) + + exemplar = {"data": np.array([0], dtype=np.int64)} + + writer = ShardedCacheWriter(tmpdir, ledger, exemplar) + for shard_name in source.shard_names: + for ex in batched(source.open_shard(shard_name), ledger.metadata.options.batch_size): + writer.write_batch(shard_name, processor(ex)) + + store = writer.finish() + + data_path = store.path + + del store + + builder = TreeStore.open(exemplar, data_path, mode="r") + + assert len(builder) == 40 + + for i, x in enumerate(builder): + np.testing.assert_array_equal(x["data"], np.asarray([i % 10 + i // 10 * 10] * 10)) + + # check totals for the ledger + ledger = writer.ledger + assert ledger.total_num_rows == 40 + assert ledger.is_finished + + for shard_name in source.shard_names: + assert ledger.shard_rows[shard_name] == 10 + + +def test_sharded_cache_writer_trims_on_resume(): + with tempfile.TemporaryDirectory() as tmpdir: + source = SimpleShardSource(num_shards=4) + processor = SimpleProcessor() + + exemplar = {"data": np.array([0], dtype=np.int64)} + + ledger = CacheLedger.load_or_initialize(tmpdir, source, processor, CacheOptions.no_fanciness(batch_size=8)) + + writer = ShardedCacheWriter(tmpdir, ledger, exemplar) + for shard_name in source.shard_names: + for ex in batched(source.open_shard(shard_name), 8): + writer.write_batch(shard_name, processor(ex)) + + writer.finish() + + # now deliberately truncate the ledger a bit + ledger = copy.deepcopy(writer.ledger) + assert ledger.total_num_rows == 40 + assert ledger.is_finished + ledger.total_num_rows = 24 + ledger.shard_rows["shard_0"] = 8 + ledger.shard_rows["shard_1"] = 8 + ledger.shard_rows["shard_2"] = 8 + ledger.shard_rows["shard_3"] = 0 + ledger.is_finished = False + + _serialize_json_and_commit(os.path.join(tmpdir, LEDGER_FILE_NAME), ledger) + + writer = ShardedCacheWriter(tmpdir, ledger, exemplar) + + # ensure it got truncated + assert writer.ledger.total_num_rows == 24 + assert writer.ledger.is_finished is False + assert writer.ledger.shard_rows["shard_0"] == 8 + assert writer.ledger.shard_rows["shard_1"] == 8 + assert writer.ledger.shard_rows["shard_2"] == 8 + assert writer.ledger.shard_rows["shard_3"] == 0 + + 
new_store = writer._tree_store + new_data = new_store[:] - # Simulate batches being processed - shard1_batch = [np.array([1, 2, 3])] - shard2_batch = [np.array([4, 5, 6])] - shard3_batch = [np.array([7, 8, 9])] - - # await writer.batch_finished.remote("shard1", 0, shard1_batch) - await writer.batch_finished.remote("shard2", 0, shard2_batch) - await writer.batch_finished.remote("shard3", 0, shard3_batch) - await writer.batch_finished.remote("shard1", 1, shard3_batch) - await writer.batch_finished.remote("shard1", 2, shard3_batch) - await writer.batch_finished.remote("shard1", 3, shard3_batch) - await writer.flush.remote() - - # Check if backpressure is signaled - is_overwhelmed = await writer.is_overwhelmed.remote() - assert is_overwhelmed is True - await writer.flush.remote() - - for i in range(4): - if (await parent.desired_next_item.remote()) == 0: - break - - await asyncio.sleep(0.1 * (i + 1) * (i + 1)) - else: - assert False, "Backpressure wasn't sent" - - await writer.batch_finished.remote("shard1", 0, shard1_batch) - - # Reduce the queue size to relieve backpressure - # Check if backpressure is relieved - is_overwhelmed = await writer.is_overwhelmed.remote() - count = 0 - while is_overwhelmed and count < 10: - await writer.flush.remote() - await asyncio.sleep(0.4) - is_overwhelmed = await writer.is_overwhelmed.remote() - count += 1 - assert is_overwhelmed is False - - for i in range(4): - if (await parent.desired_next_item.remote()) is None: - break - - await asyncio.sleep(0.1 * (i + 1) * (i + 1)) - else: - assert False, "Backpressure wasn't relieved" + assert len(new_data) == 24 diff --git a/tests/test_prefetch_actor.py b/tests/test_prefetch_actor.py new file mode 100644 index 000000000..e48546fc1 --- /dev/null +++ b/tests/test_prefetch_actor.py @@ -0,0 +1,137 @@ +import time +from typing import Iterator + +import pytest +import ray + +from levanter.store._prefetch_actor import RayPrefetchQueue + + +def _sleep_until(condition, timeout=5, message="Condition not met within timeout"): + start = time.time() + while not condition(): + if time.time() - start > timeout: + pytest.fail(message) + time.sleep(0.1) + + +@pytest.fixture(scope="module", autouse=True) +def ray_init_and_shutdown(): + ray.init("local", ignore_reinit_error=True) + yield + ray.shutdown() + + +@pytest.mark.ray +def test_initialization_and_basic_functionality(): + def simple_producer(): + for i in range(10): + yield i + + actor = RayPrefetchQueue(simple_producer) + results = [actor.get_next() for _ in range(10)] + assert results == list(range(10)) + + +@pytest.mark.ray +def test_queue_size_limit(): + def simple_producer() -> Iterator[ray.ObjectRef]: + for i in range(100): + yield i + + actor = RayPrefetchQueue(simple_producer, max_queue_size=10) + # Allow some time for the queue to fill up + _sleep_until(lambda: actor.queue_size() == 10) + + # get a few items to make some space + [actor.get_next() for _ in range(5)] + _sleep_until(lambda: actor.queue_size() == 10, message="Queue size did not reach 10") + + +@pytest.mark.ray +def test_stop_functionality(): + def simple_producer(): + for i in range(10000): + yield i + + actor = RayPrefetchQueue(simple_producer) + actor.stop() + + _sleep_until(lambda: actor.is_stopped(), message="Actor did not stop") + + +@pytest.mark.ray +def test_exception_handling(): + def faulty_producer(): + for i in range(5): + yield i + raise ValueError("Test exception") + + actor = RayPrefetchQueue(faulty_producer) + results = [] + try: + for _ in range(10): + results.append(actor.get_next()) + 
except ValueError as e: + assert "Test exception" in str(e) # Ray puts a lot of crap in the exception message + assert results == list(range(5)) + + +@pytest.mark.ray +def test_empty_producer(): + def empty_producer() -> Iterator[ray.ObjectRef]: + if False: + yield + + actor = RayPrefetchQueue(empty_producer) + with pytest.raises(StopIteration): + actor.get_next() + + +@pytest.mark.ray +def test_multiple_consumers(): + def simple_producer() -> Iterator[ray.ObjectRef]: + for i in range(20): + yield i + + actor = RayPrefetchQueue(simple_producer) + results = [actor.get_next() for _ in range(10)] + results += [actor.get_next() for _ in range(10)] + assert results == list(range(20)) + + +@pytest.mark.ray +def test_producer_completion(): + def simple_producer(): + for i in range(10): + yield i + + actor = RayPrefetchQueue(simple_producer) + results = [] + try: + while True: + results.append(actor.get_next()) + except StopIteration: + pass + assert results == list(range(10)) + + +@pytest.mark.ray +def test_drain_queue(): + def simple_producer(): + for i in range(10): + yield i + + actor = RayPrefetchQueue(simple_producer) + + all_results = [] + + for tot in range(0, 5): + out = actor.drain_available(tot) + assert len(out) <= tot + all_results.extend(out) + + while len(all_results) < 10: + all_results.append(actor.get_next()) + + assert all_results == list(range(10)) diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py index e25ef7928..66131ca48 100644 --- a/tests/test_tree_store.py +++ b/tests/test_tree_store.py @@ -1,5 +1,5 @@ import tempfile -from typing import Iterator, List, Sequence +from typing import Any, Dict, Iterator, List, Sequence import numpy as np import pytest @@ -11,9 +11,6 @@ class SimpleProcessor(BatchProcessor[Sequence[int], dict[str, np.ndarray]]): - def __init__(self, batch_size: int = 8): - self._batch_size = batch_size - def __call__(self, batch: Sequence[Sequence[int]]) -> Sequence[dict[str, Sequence[int]]]: return [{"data": x} for x in batch] @@ -21,14 +18,14 @@ def __call__(self, batch: Sequence[Sequence[int]]) -> Sequence[dict[str, Sequenc def output_exemplar(self) -> dict[str, Sequence[int]]: return {"data": np.array([0], dtype=np.int64)} - @property - def batch_size(self) -> int: - return self._batch_size - @property def num_cpus(self) -> int: return 1 + @property + def metadata(self) -> Dict[str, Any]: + return {} + class SimpleShardSource(ShardedDataSource[List[int]]): def __init__(self, num_shards: int = 4): @@ -52,7 +49,7 @@ def test_tree_builder_with_processor(): processor = SimpleProcessor() source = SimpleShardSource() - for batch in batched(source, processor.batch_size): + for batch in batched(source, 8): processed = processor(batch) builder.extend(processed) diff --git a/tests/test_utils.py b/tests/test_utils.py index 6206ec2ff..a86217549 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,7 @@ import glob import os from functools import reduce -from typing import Callable, List, Optional, Sequence, TypeVar +from typing import Any, Callable, Dict, List, Optional, Sequence, TypeVar import draccus import equinox as eqx @@ -204,6 +204,10 @@ def output_exemplar(self): def num_cpus(self) -> int: return 0 + @property + def metadata(self) -> Dict[str, Any]: + return {} + class ShardsDataSource(ShardedDataSource[T]): def __init__(self, docs: List[List[T]]): From 3bae9d3e81f72b145a7e7764926f4843e3bf6336 Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 4 Oct 2024 23:20:40 -0500 Subject: [PATCH 76/94] allow mixture components to 
override cache_dir (#754) --- config/gpt2_nano_mixture.yaml | 1 + src/levanter/data/text.py | 25 +++++++++++++++++++------ src/levanter/main/cache_dataset.py | 2 +- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/config/gpt2_nano_mixture.yaml b/config/gpt2_nano_mixture.yaml index 2939b9e5e..35b240787 100644 --- a/config/gpt2_nano_mixture.yaml +++ b/config/gpt2_nano_mixture.yaml @@ -5,6 +5,7 @@ data: id: dlwh/wikitext_103_detokenized w2: id: dlwh/wikitext_103_detokenized + cache_dir: wikitext2_cache train_weights: wikitext: 1.0 w2: 1.0 diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index 62dfb62ba..5e595b2a1 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -466,6 +466,7 @@ class LMDatasetSourceConfig: train_urls: List[str] = () # type: ignore validation_urls: List[str] = () # type:ignore + cache_dir: Optional[str] = None # Optionally override the cache dir for this component def get_shard_source(self, split) -> Optional[ShardedDataSource[str]]: if self.id is not None: @@ -530,7 +531,7 @@ class LMTaskConfig(abc.ABC): vocab_size: Optional[int] = None # if using the passthrough tokenizer, this is required # config related to caching - cache_dir: str = "cache/" + cache_dir: Optional[str] = "cache/" cache_options: CacheOptions = field(default_factory=CacheOptions) enforce_eos: bool = True # whether to append eos even if the tokenizer doesn't @@ -560,7 +561,7 @@ def validation_sets( @property @abc.abstractmethod - def sources(self) -> dict[str, LMDatasetSourceConfig]: + def sources(self) -> Mapping[str, LMDatasetSourceConfig]: pass def tagged_eval_sets( @@ -605,7 +606,7 @@ def validation_sets( return {} @property - def sources(self) -> dict[str, LMDatasetSourceConfig]: + def sources(self) -> Mapping[str, LMDatasetSourceConfig]: return {"": self} @cached_property @@ -634,6 +635,9 @@ def token_seq_dataset( def build_or_load_cache( self, split: str, monitors: Union[bool, List[MetricsMonitor]] = True, logger_name: Optional[str] = None ) -> Optional[TreeCache[BatchEncoding]]: + if self.cache_dir is None: + raise ValueError("cache_dir cannot be None") + split_cache_dir = os.path.join(self.cache_dir, split) name = logger_name or os.path.basename(self.cache_dir) @@ -788,10 +792,19 @@ def build_caches( if weight == 0 and split == "train": continue - source_config_dict = source_config.__dict__ + source_config_dict = dict(**source_config.__dict__) + + if source_config.cache_dir is None: + # replace with the main cache dir/{name} + if self.cache_dir is None: + raise ValueError( + "If the 'main' cache_dir is None, then all component cache_dirs must be non-None, but" + f"{name}'s cache_dir is None." 
+ ) + cache_dir = os.path.join(self.cache_dir, name) + source_config_dict["cache_dir"] = cache_dir dataset = LMDatasetConfig( - cache_dir=os.path.join(self.cache_dir, name), **source_config_dict, **task_config_dict, ) @@ -813,5 +826,5 @@ def build_caches( return caches @property - def sources(self) -> dict[str, LMDatasetSourceConfig]: + def sources(self) -> Mapping[str, LMDatasetSourceConfig]: return self.configs diff --git a/src/levanter/main/cache_dataset.py b/src/levanter/main/cache_dataset.py index 2483e9214..caccc567c 100644 --- a/src/levanter/main/cache_dataset.py +++ b/src/levanter/main/cache_dataset.py @@ -31,7 +31,7 @@ def main(args: RayCachedLMDatasetConfig): print(f"Caching {split} to {args.cache_dir}.") # connect or start the actor batch_tokenizer = BatchTokenizer(tokenizer, enforce_eos=args.enforce_eos) - split_cache_dir = os.path.join(args.cache_dir, split) + split_cache_dir = os.path.join(args.cache_dir, split) # type: ignore source = args.get_shard_source(split) if source is None: From 98477280870e42b3c0de3cc2d4ccbfbe622f1ce2 Mon Sep 17 00:00:00 2001 From: David Hall Date: Sat, 5 Oct 2024 12:18:51 -0500 Subject: [PATCH 77/94] a few final tweaks for marin runs (#755) --- docs/Configuration-Guide.md | 1 - src/levanter/checkpoint.py | 6 +++++- src/levanter/data/text.py | 4 ++++ src/levanter/main/train_lm.py | 6 +++++- src/levanter/trainer.py | 1 - 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/Configuration-Guide.md b/docs/Configuration-Guide.md index bdb09e4f1..d67997ea2 100644 --- a/docs/Configuration-Guide.md +++ b/docs/Configuration-Guide.md @@ -111,7 +111,6 @@ The following table lists some of the parameters that you might want to change. | Parameter | Description | Default | |----------------|-------------------------------------------------------------------------------|---------| | `log_dir` | Where to save logs (python logger). `$run_id` will be appended | `logs/` | -| `run_base_dir` | where to save run artifacts. not really used much. 
`$run_id` will be appended | `runs/` | diff --git a/src/levanter/checkpoint.py b/src/levanter/checkpoint.py index b102198d7..5bfb6be30 100644 --- a/src/levanter/checkpoint.py +++ b/src/levanter/checkpoint.py @@ -549,8 +549,12 @@ class CheckpointerConfig: default_factory=lambda: [dict(every=10000)] ) # list of dicts with two keys: every and until + append_run_id_to_base_path: bool = True + def expanded_path(self, run_id) -> str: - return os.path.expanduser(os.path.join(self.base_path, run_id)) + if self.append_run_id_to_base_path: + return os.path.expanduser(os.path.join(self.base_path, run_id)) + return os.path.expanduser(self.base_path) def create(self, run_id) -> Checkpointer: keeps = [CheckpointInterval(**k) for k in self.keep] diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index 5e595b2a1..bcfcad397 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -577,6 +577,8 @@ def tagged_eval_sets( class LMDatasetConfig(LMDatasetSourceConfig, LMTaskConfig): """This class supports loading data both from HF Datasets and from a raw dataset of jsonl urls""" + cache_dir: Optional[str] = "cache/" + def train_set( self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True, *, key: Optional[PRNGKeyArray] = None ) -> AsyncDataset[np.ndarray]: @@ -705,6 +707,8 @@ def _convert_id_to_token(self, index: int) -> str: class LMMixtureDatasetConfig(LMTaskConfig): """This class represents a mixture of datasets with their associated weights.""" + cache_dir: Optional[str] = "cache/" + # data source configs and weights configs: Dict[str, LMDatasetSourceConfig] = field(default_factory=dict) """ configuration of each dataset source (urls, hf dataset id, etc.) """ diff --git a/src/levanter/main/train_lm.py b/src/levanter/main/train_lm.py index 6c96f8b62..c8316090a 100644 --- a/src/levanter/main/train_lm.py +++ b/src/levanter/main/train_lm.py @@ -188,7 +188,11 @@ def main(config: TrainLmConfig): callbacks.log_performance_stats(Pos.size, trainer.config.train_batch_size, flops_per_example), every=1 ) if config.hf_save_path is not None: - full_save_path = os.path.join(config.hf_save_path, trainer.run_id) + # bit gross to reach this far into the config, but it's fine + if config.trainer.checkpointer.append_run_id_to_base_path: + full_save_path = os.path.join(config.hf_save_path, trainer.run_id) + else: + full_save_path = config.hf_save_path trainer.add_hook( save_hf_checkpoint_callback(full_save_path, converter, upload_to_hf=config.hf_upload or False), diff --git a/src/levanter/trainer.py b/src/levanter/trainer.py index 69c932cd9..8e98eaedb 100644 --- a/src/levanter/trainer.py +++ b/src/levanter/trainer.py @@ -519,7 +519,6 @@ class TrainerConfig: wandb: Optional[tracker.wandb.WandbConfig] = None log_dir: Path = Path("logs/") - run_base_dir: Path = Path("runs/") id: Optional[str] = None # run id. if None, will be set to a random string tracker: TrackerConfig | Tuple[TrackerConfig, ...] = field(default_factory=tracker.wandb.WandbConfig) From 36b29fd56c8191ee8e906ddd0b95eac17964e902 Mon Sep 17 00:00:00 2001 From: William Held Date: Tue, 8 Oct 2024 20:49:34 -0400 Subject: [PATCH 78/94] Update Audio Data Loader to Support Mixture Dataset (#758) Pulls in the New Mixture Features Into Audio Space! Tested that this fixes the previous epoching errors in the whisper_tiny config. 
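As a quick illustration of what this patch enables (a sketch only, not part of the diff): the new `AudioMixtureDatasetConfig` mirrors the text-side mixture config, so an audio training set can also be assembled programmatically, roughly as below. The bucket paths are placeholders; the fields follow those added in `src/levanter/data/audio.py` further down.

```python
# Hedged sketch: GCS paths are placeholders, and cache building still needs a running Ray cluster.
import jax.random as jrandom

from levanter.data.audio import AudioDatasetSourceConfig, AudioMixtureDatasetConfig

data = AudioMixtureDatasetConfig(
    cache_dir="gs://my-bucket/processed/mixture",  # placeholder bucket
    configs={
        "librispeech": AudioDatasetSourceConfig(
            id="WillHeld/librispeech_parquet",
            cache_dir="gs://my-bucket/processed/librispeech",  # per-component cache dir
            train_split="train.360",
            validation_split="validation",
        ),
    },
    train_weights={"librispeech": 1.0},
)

# Builds (or loads) each component cache and returns a weighted mixture dataset.
train_set = data.train_set(key=jrandom.PRNGKey(0))
```

In the patch itself this is driven by the YAML `data` block shown in `config/whisper_tiny_librispeech.yaml` below.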
--- config/whisper_tiny_librispeech.yaml | 14 +- docs/tutorials/Training-On-Audio-Data.md | 31 +++-- src/levanter/data/audio.py | 157 ++++++++++++++++++++++- src/levanter/main/train_asr.py | 11 +- 4 files changed, 189 insertions(+), 24 deletions(-) diff --git a/config/whisper_tiny_librispeech.yaml b/config/whisper_tiny_librispeech.yaml index 0b13491ae..85815dad8 100644 --- a/config/whisper_tiny_librispeech.yaml +++ b/config/whisper_tiny_librispeech.yaml @@ -1,10 +1,15 @@ data: - id: WillHeld/librispeech_parquet - cache_dir: "gs://public_data_lev/processed/librispeech" - train_split: "train.360" - validation_split: "validation" + cache_dir: "gs://diva-flash/processed/mixture" # The Whisper Tokenizer is way too large for Librispeech tokenizer: "facebook/wav2vec2-base-960h" + configs: + librispeech: + id: WillHeld/librispeech_parquet + cache_dir: "gs://diva-flash/processed/librispeech" + train_split: "train.360" + validation_split: "validation" + train_weights: + librispeech: 1.0 model: type: whisper vocab_size: 32 @@ -24,3 +29,4 @@ optimizer: learning_rate: 3E-3 weight_decay: 0.1 warmup: 0.01 +hf_save_steps: 16000 diff --git a/docs/tutorials/Training-On-Audio-Data.md b/docs/tutorials/Training-On-Audio-Data.md index d78c6909d..1e5e9575b 100644 --- a/docs/tutorials/Training-On-Audio-Data.md +++ b/docs/tutorials/Training-On-Audio-Data.md @@ -43,13 +43,17 @@ you can specify the dataset name in the `data` section of your training configur ```yaml data: - id: "WillHeld/librispeech_parquet" - # if needed: - # name: "subset" - train_split: "train.360" - validation_split: "validation" - text_key: "text" - audio_key: "audio" + cache_dir: "gs://diva-flash/processed/mixture" + # The Whisper Tokenizer is way too large for Librispeech + tokenizer: "facebook/wav2vec2-base-960h" + configs: + librispeech: + id: WillHeld/librispeech_parquet + cache_dir: "gs://diva-flash/processed/librispeech" + train_split: "train.360" + validation_split: "validation" + train_weights: + librispeech: 1.0 ``` Levanter directly supports the HuggingFace [Audio](https://huggingface.co/docs/datasets/v2.18.0/en/package_reference/main_classes#datasets.Audio) class. Underlying this class is a simple dictionary, which fits into one of the following 3 modes. The first mode is completely pre-processed audio which provides a time-domain `array` of audio data along with a pre-defined `sampling_rate`. The second mode is data which has been loaded into memory as a sequence of `bytes`, but has not been decoded to raw audio data. Finally, if *only* the `path` of the dictionary is defined this points to where the audio file for that example is stored. Levanter will transparently handle all of these modes and process them uniformly to the `array` and `sampling_rate` which is required for downstream modeling. 
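To make the three modes concrete, here is a small sketch (commentary, not part of the patch) of the dictionary shapes the HuggingFace `Audio` feature can take; the byte string and paths are placeholders.

```python
import numpy as np

# Mode 1: fully decoded audio -- time-domain samples plus a known sampling rate.
decoded = {"array": np.zeros(16_000, dtype=np.float32), "sampling_rate": 16_000}

# Mode 2: the encoded file has been read into memory but not yet decoded.
encoded = {"bytes": b"<raw flac/wav bytes>", "path": "clip.flac"}  # placeholder contents

# Mode 3: only a path is present; the file is fetched and decoded on demand.
referenced = {"path": "gs://my-bucket/audio/clip.flac"}  # placeholder path
```

Whatever the starting mode, each example ends up in the `array` plus `sampling_rate` form before feature extraction.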
@@ -101,12 +105,17 @@ Here's a configuration for a Whisper Tiny model with reasonable values for every ```yaml data: - id: WillHeld/librispeech_parquet - cache_dir: "gs://bucket_for_processed_data/processed/librispeech" - train_split: "train.360" - validation_split: "validation" + cache_dir: "gs://diva-flash/processed/mixture" # The Whisper Tokenizer is way too large for Librispeech tokenizer: "facebook/wav2vec2-base-960h" + configs: + librispeech: + id: WillHeld/librispeech_parquet + cache_dir: "gs://diva-flash/processed/librispeech" + train_split: "train.360" + validation_split: "validation" + train_weights: + librispeech: 1.0 model: type: whisper vocab_size: 32 diff --git a/src/levanter/data/audio.py b/src/levanter/data/audio.py index f8a193f04..1fa8ec078 100644 --- a/src/levanter/data/audio.py +++ b/src/levanter/data/audio.py @@ -1,4 +1,5 @@ import abc +import dataclasses import functools import logging import os @@ -12,6 +13,7 @@ import fsspec import jax import numpy as np +from draccus import field from jaxtyping import PRNGKeyArray from typing_extensions import TypedDict @@ -23,6 +25,7 @@ from levanter.data._preprocessor import BatchProcessor from levanter.data.dataset import MappedAsyncDataset from levanter.data.metrics_monitor import LoggerMetricsMonitor, LoggingMetricsMonitor, MetricsMonitor +from levanter.data.mixture import MixtureDataset, StopStrategy from levanter.data.sharded_datasource import AudioTextUrlDataSource, ShardedDataSource, WrappedHFDataSource from levanter.data.text import BatchTokenizer @@ -30,7 +33,7 @@ from levanter.logging import silence_transformer_nag from levanter.models.asr_model import AudioTextExample from levanter.store.cache import CacheOptions, TreeCache, build_or_load_cache -from levanter.utils.jax_utils import local_cpu_mesh +from levanter.utils.jax_utils import key_iterator, local_cpu_mesh silence_transformer_nag() # noqa @@ -154,8 +157,11 @@ class AudioDatasetSourceConfig: audio_key: str = "audio" # key for the text field in the jsonl file or hf dataset sampling_rate: int = 16_000 + train_split: str = "train" + validation_split: str = "validation" train_urls: List[str] = () # type: ignore validation_urls: List[str] = () # type:ignore + cache_dir: str = "cache/" def get_shard_source(self, split) -> Optional[ShardedDataSource[Tuple[np.ndarray, int, str]]]: if self.id is not None: @@ -218,10 +224,6 @@ def fsspec_expand_glob(url): class AudioTaskConfig(abc.ABC): processor: str = "openai/whisper-tiny" tokenizer: Optional[str] = None - # config related to caching - train_split: str = "train" - validation_split: str = "validation" - cache_dir: str = "cache/" enforce_bos: bool = True # whether to append bos even if the tokenizer doesn't enforce_eos: bool = True # whether to append eos even if the tokenizer doesn't max_length: int = 448 @@ -250,6 +252,8 @@ def train_set( self, monitors: Union[bool, List[MetricsMonitor]] = True, options: CacheOptions = CacheOptions.default(), + *, + key: Optional[PRNGKeyArray] = None, ) -> AsyncDataset[AudioTextDict]: pass @@ -356,7 +360,11 @@ class AudioIODatasetConfig(AudioDatasetSourceConfig, AudioTaskConfig): """This class supports loading data both from HF Datasets and from a raw dataset of jsonl urls""" def train_set( - self, monitors: Union[bool, List[MetricsMonitor]] = True, options: CacheOptions = CacheOptions.default() + self, + monitors: Union[bool, List[MetricsMonitor]] = True, + options: CacheOptions = CacheOptions.default(), + *, + key: Optional[PRNGKeyArray] = None, ) -> ProcessedAudioCache: ds = 
self.build_or_load_cache(self.train_split, monitors=monitors) if ds is None: @@ -471,3 +479,140 @@ def _convert_example(inputs: AudioTextDict) -> "AudioTextExample": # for example in self.dataset: # converted_example = _convert_example(example) # yield converted_example + + +@dataclass +class AudioMixtureDatasetConfig(AudioTaskConfig): + """This class represents a mixture of datasets with their associated weights.""" + + cache_dir: Optional[str] = "cache/" + + # data source configs and weights + configs: Dict[str, AudioDatasetSourceConfig] = field(default_factory=dict) + """ configuration of each dataset source (urls, hf dataset id, etc.) """ + train_weights: Dict[str, float] = field(default_factory=dict) + """ weights for each dataset source. They will be normalized to sum to 1. """ + shuffle: bool | int = False + """whether to shuffle the dataset. True means shuffle the whole dataset, False means don't shuffle. + If you want to shuffle in eras, set this to the era length""" + stop_strategy: str = field(default=StopStrategy.RESTART_STRATEGY) + mixture_block_size: int = 2048 + """ block size for the mixture dataset.""" + + def __post_init__(self): + if len(self.configs) == 0: + raise ValueError("At least one dataset must be provided") + + if set(self.configs.keys()) != set(self.train_weights.keys()): + raise ValueError( + f"The keys in configs and weights must be the same;got {self.configs.keys()} and" + f" {self.train_weights.keys()}" + ) + + def train_set( + self, + monitors: Union[bool, List[MetricsMonitor]] = True, + options: CacheOptions = CacheOptions.default(), + *, + key: Optional[PRNGKeyArray] = None, + ) -> AsyncDataset[AudioTextDict]: + audio_datasets = self.training_sets(monitors) + + if key is None: + key = jax.random.PRNGKey(0) + + mix_key, shuffle_key = jax.random.split(key) + + # We shuffle the components and not the overall mixture because this lets us preserve + # the "stable batch" property of the mixture dataset. + def shuffle_ds(ds, key): + if self.shuffle is True: + ds = ds.shuffle(key) + elif isinstance(self.shuffle, int): + ds = ds.era_shuffle(self.shuffle, key=key) + + return ds + + if self.shuffle: + out_datasets = {} + key_iter = key_iterator(shuffle_key) + for name, ds in audio_datasets.items(): + out_datasets[name] = shuffle_ds(ds, next(key_iter)) + audio_datasets = out_datasets + + mixture = MixtureDataset( + datasets=audio_datasets, + weights=self.train_weights, + stop_strategy=self.stop_strategy, + key=mix_key, + block_size=2048, + ) + + return mixture + + def training_sets(self, monitors: Union[bool, List[MetricsMonitor]] = True) -> Mapping[str, ProcessedAudioCache]: + doc_caches = self.build_caches("train", monitors=monitors) + return doc_caches + + def validation_sets( + self, monitors: Union[bool, List[MetricsMonitor]] = True + ) -> Mapping[str, AsyncDataset[np.ndarray]]: + doc_caches = self.build_caches("validation", monitors=monitors) + return doc_caches + + def build_caches( + self, split: str, monitors: Union[bool, List[MetricsMonitor]] = True + ) -> Dict[str, ProcessedAudioCache]: + # this is a bit gross, but we want to forward all "Task" config fields to the AudioIODatasetConfig for building. + # We do this by just grabbing all the fields from the AudioTaskConfig and forwarding them. 
+ task_config_fields = set(x.name for x in dataclasses.fields(AudioTaskConfig)) + task_config_dict = {k: v for k, v in self.__dict__.items() if k in task_config_fields and k != "cache_dir"} + + caches = {} + for name, source_config in self.configs.items(): + weight = self.train_weights.get(name, 0) + + if weight == 0 and split == "train": + continue + + source_config_dict = dict(**source_config.__dict__) + + if source_config.cache_dir is None: + # replace with the main cache dir/{name} + if self.cache_dir is None: + raise ValueError( + "If the 'main' cache_dir is None, then all component cache_dirs must be non-None, but" + f"{name}'s cache_dir is None." + ) + cache_dir = os.path.join(self.cache_dir, name) + source_config_dict["cache_dir"] = cache_dir + + dataset = AudioIODatasetConfig( + **source_config_dict, + **task_config_dict, + ) + if split == "train": + cache = dataset.build_or_load_cache(dataset.train_split, monitors) + elif split == "validation": + cache = dataset.build_or_load_cache(dataset.validation_split, monitors) + else: + cache = dataset.build_or_load_cache(split, monitors) + # drop the data source and corresponding weight if the cache is not built + if cache is None: + logger.warning(f"Skipping {name} for split {split} because no source was provided") + else: + caches[name] = cache + + # in practice it works best if we block on validation caches + if split == "validation": + for cache in caches.values(): + cache.cache.await_finished() + + else: + logger.info(f"Not waiting for {split} caches to finish building") + + return caches + + @property + def sources(self) -> Mapping[str, AudioDatasetSourceConfig]: + return self.configs diff --git a/src/levanter/main/train_asr.py b/src/levanter/main/train_asr.py index 681a806a6..c0d52eea2 100644 --- a/src/levanter/main/train_asr.py +++ b/src/levanter/main/train_asr.py @@ -14,7 +14,7 @@ import levanter from levanter import callbacks from levanter.compat.hf_checkpoints import HFCompatConfig, ModelWithHfSerializationMixin, save_hf_checkpoint_callback -from levanter.data.audio import AudioIODatasetConfig, AudioTextDataset +from levanter.data.audio import AudioIODatasetConfig, AudioMixtureDatasetConfig, AudioTextDataset from levanter.models.asr_model import ASRConfig, AudioTextExample from levanter.models.whisper import WhisperConfig from levanter.optim import AdamConfig, OptimizerConfig @@ -27,7 +27,7 @@ @dataclass class TrainASRConfig: - data: AudioIODatasetConfig = field(default_factory=AudioIODatasetConfig) + data: Union[AudioIODatasetConfig, AudioMixtureDatasetConfig] = field(default_factory=AudioMixtureDatasetConfig) trainer: TrainerConfig = field(default_factory=TrainerConfig) model: ASRConfig = field(default_factory=WhisperConfig) optimizer: OptimizerConfig = field(default_factory=AdamConfig) @@ -37,6 +37,7 @@ class TrainASRConfig: initialize_from_hf: Union[bool, str] = False """if provided, this will override the model config in the config. if true, use the default hf checkpoint for this model class""" use_hf_model_config: bool = False # if true, replace the model config with the hf config from the checkpoint + data_seed: Optional[int] = None # if provided, will override the data seed from the trainer # TODO: atm we don't support loading from a checkpoint that has a different tokenizer. 
this is a bit annoying # TODO: atm you have to at least specify a levanter model config with the same type as the hf checkpoint @@ -113,9 +114,13 @@ def compute_loss( Pos = config.model.Pos KeyPos = config.model.KeyPos + if config.data_seed is not None: + logger.info(f"Overriding data seed with {config.data_seed}") + data_key = jrandom.PRNGKey(config.data_seed) + eval_datasets = config.data.validation_sets() train_dataset = AudioTextDataset( - config.data.train_set(), + config.data.train_set(key=data_key), Pos, [config.model.Mels, config.model.MelPos], KeyPos, From 5370c72a9cfb1c75b07300aa614b7b038c6dfa6c Mon Sep 17 00:00:00 2001 From: Ahmed Ahmed Date: Wed, 9 Oct 2024 16:57:09 -0400 Subject: [PATCH 79/94] Update src/levanter/data/text.py Co-authored-by: David Hall --- src/levanter/data/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index 664f067dd..acc4ab778 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -629,7 +629,7 @@ def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase): sources = [example["input"] for example in batch] targets = [f"{example['output']}" for example in batch] - # TODO: this seems pretty wasteful since you end up tokenizing twice, but it's how the original code does it. + # TODO: this seems pretty wasteful since you end up tokenizing twice, but it's how alpaca does it. examples = [s + t for s, t in zip(sources, targets)] sources_tokenized = tokenizer(sources, padding=False, truncation=True) examples_tokenized = tokenizer(examples, padding=False, truncation=True) From 2f625d3c952eb3e933a9834b8beef3a9bf9aafc0 Mon Sep 17 00:00:00 2001 From: Ahmed Ahmed Date: Wed, 9 Oct 2024 14:02:17 -0700 Subject: [PATCH 80/94] address david's comments --- src/levanter/data/text.py | 43 +++--------------------------- src/levanter/utils/fsspec_utils.py | 17 +++++++++++- 2 files changed, 19 insertions(+), 41 deletions(-) diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index 861a017b0..fdd935d82 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -10,7 +10,7 @@ from itertools import chain from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, TypeVar, Union -import braceexpand + import datasets import equinox as eqx import fsspec @@ -38,6 +38,7 @@ from levanter.store.jagged_array import JaggedArrayStore from levanter.store.tree_store import TreeStore from levanter.utils.hf_utils import num_cpus_used_by_tokenizer +from levanter.utils.fsspec_utils import fsspec_expand_glob silence_transformer_nag() # noqa @@ -378,20 +379,6 @@ def num_gpus(self) -> int: return 0 -def fsspec_expand_glob(url): - expanded_urls = braceexpand.braceexpand(url) - for expanded_url in expanded_urls: - if "*" in expanded_url: - fs = fsspec.core.url_to_fs(expanded_url)[0] - globbed = fs.glob(expanded_url) - # have to append the fs prefix back on - protocol, _ = fsspec.core.split_protocol(expanded_url) - if protocol is None: - yield from globbed - else: - yield from [f"{protocol}://{path}" for path in globbed] - else: - yield expanded_url def concatenate_and_group_texts( @@ -578,7 +565,7 @@ def tagged_eval_sets( @dataclass -class LMSupervisedDatasetConfig(LMDatasetSourceConfig): +class LMSupervisedDatasetConfig: """This class represents a dataset source with URLs or hf name/id.""" cache_dir: str = "cache/" @@ -589,30 +576,6 @@ class LMSupervisedDatasetConfig(LMDatasetSourceConfig): validation_urls: List[str] = () # type:ignore - 
# def token_seq_dataset( - # self, split: str, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True - # ) -> Optional[TokenSeqDataset]: - # cache = self.build_or_load_cache(split, monitors=monitors) - # if cache is None: - # return None - # return TokenSeqDataset(cache, seq_len) - - # def validation_set( - # self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True - # ) -> Optional[TokenSeqDataset]: - # return self.token_seq_dataset("validation", seq_len, monitors) - - # def validation_sets( - # self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True - # ) -> Mapping[str, AsyncDataset[np.ndarray]]: - # validation_set = self.validation_set(seq_len, monitors) - # if validation_set is not None: - # return {"": validation_set} - # else: - # return {} - - # Add tagged eval set with split for auxiliary and validation dataset - def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase): sources = [example["input"] for example in batch] diff --git a/src/levanter/utils/fsspec_utils.py b/src/levanter/utils/fsspec_utils.py index 896ea8450..6a1341bff 100644 --- a/src/levanter/utils/fsspec_utils.py +++ b/src/levanter/utils/fsspec_utils.py @@ -1,5 +1,5 @@ import fsspec - +import braceexpand def exists(url, **kwargs) -> bool: """Check if a file exists on a remote filesystem.""" @@ -11,3 +11,18 @@ def mkdirs(path): """Create a directory and any necessary parent directories.""" fs, path = fsspec.core.url_to_fs(path) fs.makedirs(path, exist_ok=True) + +def fsspec_expand_glob(url): + expanded_urls = braceexpand.braceexpand(url) + for expanded_url in expanded_urls: + if "*" in expanded_url: + fs = fsspec.core.url_to_fs(expanded_url)[0] + globbed = fs.glob(expanded_url) + # have to append the fs prefix back on + protocol, _ = fsspec.core.split_protocol(expanded_url) + if protocol is None: + yield from globbed + else: + yield from [f"{protocol}://{path}" for path in globbed] + else: + yield expanded_url \ No newline at end of file From cf2c9e5b20714b2cdd051f60ff4213982fb4c497 Mon Sep 17 00:00:00 2001 From: Ahmed Ahmed Date: Wed, 9 Oct 2024 14:08:32 -0700 Subject: [PATCH 81/94] lint and minor --- scripts/launch_gpt2_small_fast_supervised_tpu.sh | 6 ++++++ scripts/launch_gpt2_small_fast_tpu.sh | 2 +- src/levanter/data/text.py | 2 -- src/levanter/utils/fsspec_utils.py | 4 +++- 4 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 scripts/launch_gpt2_small_fast_supervised_tpu.sh diff --git a/scripts/launch_gpt2_small_fast_supervised_tpu.sh b/scripts/launch_gpt2_small_fast_supervised_tpu.sh new file mode 100644 index 000000000..df38aec99 --- /dev/null +++ b/scripts/launch_gpt2_small_fast_supervised_tpu.sh @@ -0,0 +1,6 @@ +# Launches the "gpt_small_fast" model on a TPU node + +python infra/launch.py --foreground --tpu_name $(whoami)-levanter-itest-32 --zone us-central2-b --tpu_type v4-32 --preemptible -- \ + python -m levanter.main.train_lm \ + --config_path config/gpt2_small_fast_supervised.yaml \ + --trainer.checkpointer.base_path gs://levanter-checkpoints/gpt-itest/ --trainer.checkpointer.save_interval 30m $* diff --git a/scripts/launch_gpt2_small_fast_tpu.sh b/scripts/launch_gpt2_small_fast_tpu.sh index df38aec99..0c09cdcfa 100644 --- a/scripts/launch_gpt2_small_fast_tpu.sh +++ b/scripts/launch_gpt2_small_fast_tpu.sh @@ -2,5 +2,5 @@ python infra/launch.py --foreground --tpu_name $(whoami)-levanter-itest-32 --zone us-central2-b --tpu_type v4-32 --preemptible -- \ python -m levanter.main.train_lm \ - --config_path 
config/gpt2_small_fast_supervised.yaml \ + --config_path config/gpt2_small_fast.yaml \ --trainer.checkpointer.base_path gs://levanter-checkpoints/gpt-itest/ --trainer.checkpointer.save_interval 30m $* diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index fdd935d82..dfd16f844 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -379,8 +379,6 @@ def num_gpus(self) -> int: return 0 - - def concatenate_and_group_texts( encoding: BatchEncoding, seq_len: int, diff --git a/src/levanter/utils/fsspec_utils.py b/src/levanter/utils/fsspec_utils.py index 6a1341bff..452ab3d84 100644 --- a/src/levanter/utils/fsspec_utils.py +++ b/src/levanter/utils/fsspec_utils.py @@ -1,6 +1,7 @@ import fsspec import braceexpand + def exists(url, **kwargs) -> bool: """Check if a file exists on a remote filesystem.""" fs, path = fsspec.core.url_to_fs(url, **kwargs) @@ -12,6 +13,7 @@ def mkdirs(path): fs, path = fsspec.core.url_to_fs(path) fs.makedirs(path, exist_ok=True) + def fsspec_expand_glob(url): expanded_urls = braceexpand.braceexpand(url) for expanded_url in expanded_urls: @@ -25,4 +27,4 @@ def fsspec_expand_glob(url): else: yield from [f"{protocol}://{path}" for path in globbed] else: - yield expanded_url \ No newline at end of file + yield expanded_url From adf4b6d46545f1f95b5eba41dfddcaec68b553c4 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 10 Oct 2024 10:53:08 -0500 Subject: [PATCH 82/94] Add an actor pool for batch processing, switch to a thread for writing batches instead of a ray actor/task (#757) About a 5x speedup. Memory usage isn't super well controlled in mixtures and that needs some work --- config/data/pile_mixture.yaml | 15 +- src/levanter/data/_preprocessor.py | 72 ++++++- src/levanter/data/audio.py | 8 +- src/levanter/data/text.py | 1 + src/levanter/main/cache_dataset.py | 1 + src/levanter/store/cache.py | 303 ++++++++++++++++------------- src/levanter/store/jagged_array.py | 121 +++++++++--- src/levanter/store/tree_store.py | 67 ++----- src/levanter/utils/actor_pool.py | 224 +++++++++++++++++++++ src/levanter/utils/thread_utils.py | 24 +++ tests/test_actor_pool.py | 167 ++++++++++++++++ tests/test_jagged_array.py | 71 ++++++- tests/test_new_cache.py | 58 ------ tests/test_tree_store.py | 28 +++ 14 files changed, 874 insertions(+), 286 deletions(-) create mode 100644 src/levanter/utils/actor_pool.py create mode 100644 tests/test_actor_pool.py diff --git a/config/data/pile_mixture.yaml b/config/data/pile_mixture.yaml index ff75b8941..38b545f66 100644 --- a/config/data/pile_mixture.yaml +++ b/config/data/pile_mixture.yaml @@ -1,5 +1,8 @@ cache_dir: "gs://levanter-data/tokenized/pile-domains/" tokenizer: "EleutherAI/gpt-neox-20b" +cache_options: + batch_size: 32 + num_shard_groups: 16 configs: arxiv: train_urls: @@ -11,11 +14,11 @@ configs: - gs://levanter-data/pile-domains/books2/{00..29}.jsonl.zst validation_urls: - gs://levanter-data/pile-domains/books2/val.jsonl.zst - books3: - train_urls: - - gs://levanter-data/pile-domains/books3/{00..29}.jsonl.zst - validation_urls: - - gs://levanter-data/pile-domains/books3/val.jsonl.zst +# books3: +# train_urls: +# - gs://levanter-data/pile-domains/books3/{00..29}.jsonl.zst +# validation_urls: +# - gs://levanter-data/pile-domains/books3/val.jsonl.zst dm_math: train_urls: - gs://levanter-data/pile-domains/dm_math/{00..29}.jsonl.zst @@ -115,7 +118,7 @@ train_weights: # these weights come from the paper https://arxiv.org/pdf/2101.00027.pdf pile_cc: 0.1811 pubmed_central: 0.1440 - books3: 0.1207 +# books3: 0.1207 
owt2: 0.1001 arxiv: 0.0896 github: 0.0759 diff --git a/src/levanter/data/_preprocessor.py b/src/levanter/data/_preprocessor.py index 3c1f77494..573015852 100644 --- a/src/levanter/data/_preprocessor.py +++ b/src/levanter/data/_preprocessor.py @@ -1,8 +1,13 @@ +import logging from abc import ABC, abstractmethod from typing import Any, Callable, Dict, Generic, Iterable, Mapping, Sequence, TypeVar, Union import numpy as np import pyarrow as pa +import ray + +from levanter.utils.actor_pool import AutoScalingActorPool, PoolWorkerBase +from levanter.utils.ray_utils import RefBox T = TypeVar("T") @@ -143,12 +148,12 @@ def rec(dataset): source, transforms, batch_transform = rec(dataset) - batch_size = batch_transform.batch_size if batch_transform is not None else 1024 + # batch_size = batch_transform.batch_size if batch_transform is not None else 1024 cpus = batch_transform.num_cpus if batch_transform is not None else 1 gpus = batch_transform.num_gpus if batch_transform is not None else 0 resources = batch_transform.resources if batch_transform is not None else {} - return source, _CompositeBatchProcessor(transforms, batch_size, cpus, gpus, resources) + return source, _CompositeBatchProcessor(transforms, cpus, gpus, resources) class _CompositeBatchProcessor(BatchProcessor): @@ -157,7 +162,6 @@ def __init__(self, transforms, num_cpus, num_gpus, resources): self._num_cpus = num_cpus self._num_gpus = num_gpus self._resources = resources - self._batch_size = batch_size @property def batch_size(self): @@ -230,3 +234,65 @@ def to_hf_batched(x): return x return {b.field(i).name: to_hf_batched(b.column(i).to_numpy(zero_copy_only=False)) for i in range(b.num_columns)} + + +@ray.remote(num_cpus=0) +class BatchProcessorPool: + def __init__(self, processor: BatchProcessor, min_size: int = 1, max_size: int = 10): + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(message)s") + processor_ref = ray.put(processor) + self.actor_pool = AutoScalingActorPool( + lambda: _create_batch_processor_actor(processor, processor_ref), min_size, max_size + ) + + async def process_batch(self, batch_ref: RefBox): + return await self.actor_pool.submit( + lambda a, b: a.process_batch.remote(b), batch_ref.ref, obj_ref=batch_ref.ref + ) + + def num_pending_tasks(self): + return self.actor_pool.num_pending_tasks + + +def _create_batch_processor_actor(processor: BatchProcessor, processor_ref): + cpus = processor.num_cpus + gpus = processor.num_gpus + resources = processor.resources + return _BatchProcessorActor.options( # type: ignore + num_cpus=cpus, num_gpus=gpus, resources=resources, scheduling_strategy="SPREAD" + ).remote(processor_ref) + + +@ray.remote +class _BatchProcessorActor(PoolWorkerBase): + def __init__(self, processor: BatchProcessor): + from levanter.store.tree_store import TreeBatchPreparer + + self.processor = processor + self.preparer = TreeBatchPreparer(processor.output_exemplar) + + def process_batch(self, batch): + result = self.processor(batch) + result = _canonicalize_batch(result) + prepared = self.preparer(result) + return prepared + + +def _canonicalize_batch(batch: Union[dict, list[dict]]) -> list[dict]: + if isinstance(batch, pa.RecordBatch): + batch = dict_from_record_batch(batch) + + if isinstance(batch, dict): + return _to_list_of_dicts(batch) + else: + return batch + + +def _to_list_of_dicts(batch: dict) -> list[dict]: + """ + Convert a batch of dictionaries to a list of dictionaries, suitable for writing to a cache. 
+ """ + keys = list(batch.keys()) + values = list(batch.values()) + num_rows = len(values[0]) + return [{key: values[i][j] for i, key in enumerate(keys)} for j in range(num_rows)] diff --git a/src/levanter/data/audio.py b/src/levanter/data/audio.py index 1fa8ec078..12695a20b 100644 --- a/src/levanter/data/audio.py +++ b/src/levanter/data/audio.py @@ -305,6 +305,7 @@ def build_or_load( override_resources=None, max_length=448, cache_options: CacheOptions = CacheOptions.default(), + split: str = "", ) -> "ProcessedAudioCache": bp = BatchAudioProcessor( processor, @@ -316,12 +317,7 @@ def build_or_load( ) monitors = monitors or [] cache = build_or_load_cache( - cache_dir, - source, - bp, - await_finished=await_finished, - monitors=monitors, - options=cache_options, + cache_dir, source, bp, await_finished=await_finished, monitors=monitors, options=cache_options, split=split ) if cache.is_finished: logger.info(f"Cache {cache_dir} is complete.") diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index dfd16f844..f2a3b8497 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -735,6 +735,7 @@ def build_or_load_cache( monitors=monitors, await_finished=False, options=self.cache_options, + split=split, ) diff --git a/src/levanter/main/cache_dataset.py b/src/levanter/main/cache_dataset.py index caccc567c..92471e997 100644 --- a/src/levanter/main/cache_dataset.py +++ b/src/levanter/main/cache_dataset.py @@ -48,6 +48,7 @@ def main(args: RayCachedLMDatasetConfig): processor=batch_tokenizer, await_finished=False, monitors=monitors, + split=split, ) cache.await_finished() diff --git a/src/levanter/store/cache.py b/src/levanter/store/cache.py index eae9f8402..c0bda78f9 100644 --- a/src/levanter/store/cache.py +++ b/src/levanter/store/cache.py @@ -12,25 +12,35 @@ from concurrent.futures import Future as threading_Future from contextlib import AbstractContextManager from dataclasses import dataclass -from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, TypeVar, Union +from typing import Any, Callable, Dict, Iterator, List, Mapping, Optional, Sequence, TypeVar, Union import deepdiff import fsspec.core +import jax import pyarrow as pa import ray from dataclasses_json import dataclass_json from fsspec import AbstractFileSystem +from jaxtyping import PyTree from ray.actor import ActorHandle -from ray.remote_function import RemoteFunction from levanter.data.dataset import AsyncDataset from levanter.store._prefetch_actor import QueueEmpty, RayPrefetchQueue from levanter.utils.py_utils import Stopwatch -from ..data._preprocessor import BatchProcessor, BatchResult, dict_from_record_batch +from ..data._preprocessor import BatchProcessor, BatchProcessorPool, BatchResult, dict_from_record_batch from ..data.metrics_monitor import InProgressCacheMetrics, LoggerMetricsMonitor, MetricsMonitor from ..data.sharded_datasource import ShardedDataSource -from ..utils.ray_utils import ExceptionInfo, SnitchRecipient, current_actor_handle, log_failures_to, ser_exc_info +from ..utils.ray_utils import ( + ExceptionInfo, + RefBox, + SnitchRecipient, + current_actor_handle, + log_failures_to, + ser_exc_info, +) +from ..utils.thread_utils import ExceptionTrackingThread +from .jagged_array import PreparedBatch from .tree_store import TreeStore @@ -43,7 +53,7 @@ LEDGER_FILE_NAME = "shard_ledger.json" DEFAULT_LOG_LEVEL = pylogging.INFO -LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s" @dataclass_json @@ 
-66,6 +76,13 @@ class CacheOptions: """The batch size to use when processing the data. This is used to control the memory usage of the cache building process. Lower values will use less memory but take somewhat longer to build the cache.""" + # the below options don't actually impact the cache's result, but do impact construction + num_batches_per_flush = 256 + """The number of batches to process before flushing the cache to disk. This is used to control the memory usage of + the cache building process. Lower values will use less memory but may take somewhat longer to build the cache.""" + prefetch_per_group: int = 4 + """The number of batches to prefetch per group. This is used to keep the processors busy and to reduce the time""" + @staticmethod def default(): return CacheOptions() @@ -95,6 +112,7 @@ def build_or_load_cache( monitors: Optional[Sequence["MetricsMonitor"]] = None, options: CacheOptions = CacheOptions.default(), force_flush: bool = False, + split: str = "test", ) -> "TreeCache[U]": """ Produces a sharded cache of the dataset using Ray for distributed processing. The cache can be any path @@ -134,6 +152,7 @@ def build_or_load_cache( processor=processor, options=options, force_flush=force_flush, + split=split, ) if cache.is_finished: @@ -297,6 +316,7 @@ def build_or_load( processor: BatchProcessor[T, U], options: Optional["CacheOptions"] = None, force_flush: bool = False, + split: str = "test", ) -> "TreeCache[U]": if options is None: options = CacheOptions.default() @@ -310,6 +330,7 @@ def build_or_load( processor=processor, options=options, force_flush=force_flush, + split=split, ) return TreeCache(cache_dir=cache_dir, exemplar=processor.output_exemplar, ledger=None, _broker=broker) @@ -585,9 +606,9 @@ def write_batch(self, batch: BatchResult): if isinstance(batch, pa.RecordBatch): raise NotImplementedError("Only non-RecordBatch batches are supported for now") - batch = _canonicalize_batch(batch) # type: ignore + cbatch = _canonicalize_batch(batch) # type: ignore - self._tree_store.extend(batch) + self._tree_store.extend(cbatch) class ShardedCacheWriter: @@ -612,7 +633,6 @@ def __init__( self._tree_store = TreeStore.open(exemplar, self.cache_dir, mode="a") # type: ignore self._tree_store.trim_to_size(self._ledger.total_num_rows) - self._items_ready_to_write: list = [] @property def ledger(self): @@ -627,7 +647,6 @@ def is_finished(self): return self._ledger.is_finished def finish_shard(self, shard_name: str, num_rows: int): - self.flush() current_rows = self._ledger.shard_rows.get(shard_name, 0) if current_rows != num_rows: raise ValueError(f"Expected {num_rows} rows in finished shard {shard_name}, but found {current_rows}") @@ -635,6 +654,21 @@ def finish_shard(self, shard_name: str, num_rows: int): self._ledger.finished_shards.append(shard_name) self._ledger._serialize_and_commit(self.cache_dir) + def write_prepared_batch(self, shard_counts: Mapping[str, int], batch: PyTree[PreparedBatch]): + if self.is_finished: + raise RuntimeError("Cannot write to a finished cache") + self._tree_store.extend_with_batch(batch) + + for shard, num_rows in shard_counts.items(): + self._ledger.shard_rows[shard] = self._ledger.shard_rows.get(shard, 0) + num_rows + + total_rows = self._ledger.total_num_rows + sum(shard_counts.values()) + self._ledger.total_num_rows = total_rows + self._ledger._serialize_and_commit(self.cache_dir) + + if self._on_write: + self._on_write(self._ledger) + def write_batch(self, shard_name: str, batch: BatchResult): if self.is_finished: raise RuntimeError("Cannot 
write to a finished cache") @@ -643,15 +677,11 @@ def write_batch(self, shard_name: str, batch: BatchResult): raise NotImplementedError("Only non-RecordBatch batches are supported for now") batch = _canonicalize_batch(batch) # type: ignore + prepared = self._tree_store.batch_preparer(batch) - self._items_ready_to_write.append((shard_name, batch)) - - def flush(self): - self._attempt_to_write_batches() + return self.write_prepared_batch({shard_name: len(batch)}, prepared) def finish(self): - self.flush() - # if successful, write the ledger logger.info("Finished writing cache") self._ledger.is_finished = True @@ -661,59 +691,28 @@ def finish(self): return self._tree_store - def _attempt_to_write_batches(self): - if self._ledger.is_finished: - return - - if not self._items_ready_to_write: - return - - updated_shards = self._write_available_batches() - - logger.debug(f"Updated shards: {updated_shards}") - - did_write = len(updated_shards) > 0 - - if did_write: - - for shard, num_rows in updated_shards.items(): - self._ledger.shard_rows[shard] = self._ledger.shard_rows.get(shard, 0) + num_rows - - total_rows = self._ledger.total_num_rows + sum(updated_shards.values()) - self._ledger.total_num_rows = total_rows - self._ledger._serialize_and_commit(self.cache_dir) - - if self._on_write: - self._on_write(self._ledger) - - def _write_available_batches(self): - ready = self._items_ready_to_write - self._items_ready_to_write = [] - - if len(ready) == 0: - return {} - - to_write = [] - written_by_shard = {} - for shard, batch in ready: - to_write.extend(batch) - written_by_shard[shard] = written_by_shard.get(shard, 0) + len(batch) - - self._tree_store.extend(to_write) - return written_by_shard - def _serialize_json_and_commit(path, obj): # just to be paranoid, we write to a temp file and then rename it # TODO: probably we could do better here - with fsspec.open(f"{path}.tmp", "w") as file: - file.write(obj.to_json()) - # now copy the old file to a backup fs: AbstractFileSystem = fsspec.core.url_to_fs(path)[0] fs.mkdirs(os.path.dirname(path), exist_ok=True) if fs.exists(path): + # copy the old file to a backup fs.copy(path, f"{path}.bak") - fs.rename(f"{path}.tmp", path) + + for i in range(10): + with fsspec.open(f"{path}.tmp", "w") as file: + file.write(obj.to_json()) + + try: + fs.rename(f"{path}.tmp", path) + break + except FileNotFoundError: + # this happens for some reason sometimes. It makes no sense. + # FileNotFoundError: b/levanter-data/o/scratch%2Fdlwh%2Fpile-YYY%2Fpubmed_abs%2Ftrain%2Fshard_ledger.json.tmp/rewriteTo/b/levanter-data/o/scratch%2Fdlwh%2Fpile-YYY%2Fpubmed_abs%2Ftrain%2Fshard_ledger.json + logger.exception(f"Failed to rename {path}.tmp to {path}") + pass @ray.remote(num_cpus=0.1) # keep this small b/c it doesn't do a lot @@ -728,6 +727,7 @@ def __init__( self, cache_dir: str, name: str, + split: str, # to workaround https://github.com/ray-project/ray/issues/44083 source: ShardedDataSource[T], processor: BatchProcessor[T, U], options: CacheOptions, @@ -754,9 +754,9 @@ def __init__( if self._ledger.is_finished: self.logger.info("Cache already finished. 
Nothing to do.") return - self._cache_writer = _core_writer_task.remote( - current_actor_handle(), cache_dir, self._ledger, source, processor, force_flush - ) + self._cache_writer = _core_writer_task.options( + name=f"writer::{path_for_name}", scheduling_strategy="SPREAD" + ).remote(current_actor_handle(), cache_dir, split, self._ledger, source, processor, force_flush) except Exception: # Ray behaves poorly if the constructor of an actor fails, so we catch and log here # this also propagates to the finished promise, so we can handle it there @@ -849,13 +849,14 @@ async def _do_notify_async(): asyncio.create_task(_do_notify_async()) -def _get_builder_actor(cache_dir, shard_source, processor, options=CacheOptions.default(), force_flush=False): - name = f"lev_cache_manager::{cache_dir}" +def _get_builder_actor(split, cache_dir, shard_source, processor, options=CacheOptions.default(), force_flush=False): + name = f"lev_cache_manager::{split}::{cache_dir}" path_for_name = os.path.join(*os.path.split(cache_dir)[-2:]) name_for_display = f"builder::{path_for_name}" return _TreeStoreCacheBuilder.options(name=name, get_if_exists=True).remote( # type: ignore name=name_for_display, + split=split, cache_dir=cache_dir, source=shard_source, processor=processor, @@ -906,7 +907,6 @@ class _ShardFinished: """ _TIME_BETWEEN_WRITES = 20.0 # seconds -_MAX_WRITE_BATCHES = 1000 _MIN_WRITE_BATCHES = 100 @@ -914,6 +914,7 @@ class _ShardFinished: def _core_writer_task( parent, cache_dir, + split, initial_ledger: CacheLedger, source: ShardedDataSource, processor, @@ -928,24 +929,30 @@ def _core_writer_task( * processing of the batches * writing of the batches to the cache """ - logger.setLevel(DEFAULT_LOG_LEVEL) + pylogging.basicConfig(level=DEFAULT_LOG_LEVEL, format=LOG_FORMAT) logger.info("Starting writer task") name = str(os.path.join(*cache_dir.split("/")[-2:])) # append a small random number to the name to avoid collisions name += f"::{random.randint(0, 1000)}" - def on_write(ledger): - ray.get(parent._notify_updated_ledger.remote(ledger)) - with log_failures_to(parent): - sharded_cache_writer = ray.remote(ShardedCacheWriter).remote( + + def on_write(ledger): + ray.get(parent._notify_updated_ledger.remote(ledger)) + + sharded_cache_writer = ShardedCacheWriter( cache_dir, initial_ledger, processor.output_exemplar, on_write=on_write ) + options = initial_ledger.metadata.options + num_groups = min(options.num_shard_groups or 1000000, len(source.shard_names)) + + processor_pool = _mk_processor_pool(split, processor, 0, num_groups * 4) + interleave: RayPrefetchQueue = RayPrefetchQueue( - lambda: _make_interleave(name, source, initial_ledger, processor), - 4096, + lambda: _make_interleave(name, source, initial_ledger, processor_pool), + 512, producer_options={"num_cpus": 1, "name": f"{name}::interleave"}, ) @@ -955,34 +962,35 @@ def on_write(ledger): flush_time = Stopwatch() flush_amortized_time = Stopwatch() - i = 0 - batches_since_last_write = 0 + batches: list = [] time_of_last_write = time.time() - last_flush_future: Optional[ray.ObjectRef] = None - # start_of_last_flush = time_of_last_write + batches_total = 0.0 + flush_thread = None + finished_shards_last_flush: list = [] - # for i, batch_box in enumerate(interleave): while True: - with total_time: # 0.014 + with total_time: # 0.0051 try: cur_time = time.time() time_since_last_write = cur_time - time_of_last_write remaining_time = _TIME_BETWEEN_WRITES - time_since_last_write - if batches_since_last_write > 0: - with flush_amortized_time: - if remaining_time <= 
0 or batches_since_last_write >= _MAX_WRITE_BATCHES or force_flush: - with flush_time: - # TODO: don't block? - if last_flush_future: - ray.get(last_flush_future) - # print( - # f"Flushed {batches_since_last_write} batches in" - # f" {time.time() - start_of_last_flush} seconds" - # ) - last_flush_future = sharded_cache_writer.flush.remote() - # start_of_last_flush = time.time() - batches_since_last_write = 0 + if len(batches) > 0: + with flush_amortized_time: # 6e-4 + if remaining_time <= 0 or len(batches) >= options.num_batches_per_flush or force_flush: + with flush_time: # 0.613s + shard_rows, payloads = _fetch_batches(batches) + if flush_thread is not None: + flush_thread.join() + + batches = [] + flush_thread = ExceptionTrackingThread( + target=_write_batches, + args=(sharded_cache_writer, shard_rows, payloads, finished_shards_last_flush), + ) + flush_thread.start() + + finished_shards_last_flush = [] time_of_last_write = time.time() continue else: @@ -998,18 +1006,26 @@ def on_write(ledger): with append_time: match message: case _Batch(shard, _, payload): - # TODO: ensure indices are what we expect - sharded_cache_writer.write_batch.remote(shard, payload) - batches_since_last_write += 1 - i += 1 + batches_total += 1 + batches.append((shard, payload)) + + if force_flush: + shard_rows, payloads = _fetch_batches(batches) + del batches + _write_batches( + sharded_cache_writer, shard_rows, payloads, finished_shards_last_flush + ) + batches = [] + finished_shards_last_flush = [] + case _ShardFinished(shard, total_rows): - ray.get(sharded_cache_writer.finish_shard.remote(shard, total_rows)) + finished_shards_last_flush.append((shard, total_rows)) case _: raise AssertionError(f"Unexpected message type {type(message)}") - # if i % 1000 == 0: + # if batches_total % 1000 == 0: # print( - # f"Processed {i} batches: {loading_time.average()}s load," + # f"Processed {batches_total} batches: {loading_time.average()}s load," # f" {append_time.average()}s append, {flush_time.average()}s flush blocked, " # f"{flush_amortized_time.average()}s amortized flush, " # f"{total_time.average()}s total" @@ -1021,12 +1037,43 @@ def on_write(ledger): logger.exception("Error while processing batch") raise e - sharded_cache_writer.finish.remote() + # force a flush + if len(batches) > 0: + shard_row_totals, payloads_for_batches = _fetch_batches(batches) + del batches + if flush_thread is not None: + flush_thread.join() + _write_batches(sharded_cache_writer, shard_row_totals, payloads_for_batches, finished_shards_last_flush) + + sharded_cache_writer.finish() - out = sharded_cache_writer.get_ledger.remote() + out = sharded_cache_writer.get_ledger() return out +def _write_batches(writer: ShardedCacheWriter, shard_totals, batches, finished_shards): + # concatenate the payloads + final_payload = jax.tree.map(lambda *bs: PreparedBatch.concat(bs), *batches) + writer.write_prepared_batch(shard_totals, final_payload) + + for shard, total_rows in finished_shards: + writer.finish_shard(shard, total_rows) + + +def _fetch_batches(batches) -> tuple[dict[str, int], list[PreparedBatch]]: + time_in = time.time() + shards_for_batches, payloads_for_batches = zip(*batches) + payloads_for_batches = ray.get(list(payloads_for_batches)) + time_out = time.time() + logger.info(f"Fetched {len(batches)} batches in {time_out - time_in} seconds") + + shard_row_totals: dict[str, int] = {} + for shard, payload in zip(shards_for_batches, payloads_for_batches): + shard_row_totals[shard] = shard_row_totals.get(shard, 0) + 
jax.tree.leaves(payload)[0].num_rows + + return shard_row_totals, payloads_for_batches + + def _interleave_shards(readers: Sequence[RayPrefetchQueue], first_index: int) -> Iterator[T]: # _Message """ Interleaves the results of multiple iterators. To support resume, @@ -1110,7 +1157,7 @@ def _impute_total_rows_committed_and_check_invariants(self): return total_committed, all_finished -def _make_interleave(name: str, source: ShardedDataSource, initial_ledger: CacheLedger, processor: BatchProcessor): +def _make_interleave(name: str, source: ShardedDataSource, initial_ledger: CacheLedger, processor_pool: ActorHandle): """ Given a list of ShardStatus objects and sources, creates an interleaving generator that reads from shards and tokenizes them in parallel. @@ -1134,9 +1181,6 @@ def _make_interleave(name: str, source: ShardedDataSource, initial_ledger: Cache logger.warning(f"Starting cache build with {len(statuses)} shards, in {len(groups)} groups") - process_task = _mk_process_task(processor) - processor_ref = ray.put(processor) - def _make_generator_fn(group: _ShardGroup): def generator(): pylogging.basicConfig(level=DEFAULT_LOG_LEVEL, format=LOG_FORMAT) @@ -1144,7 +1188,8 @@ def generator(): match message: case _Batch(): # processed = ray.put(process_task(ray.get(message.payload))) - processed = process_task.remote(processor_ref, message.payload) + # processed = process_task.remote(processor_ref, message.payload) + processed = processor_pool.process_batch.remote(RefBox(message.payload)) yield dataclasses.replace(message, payload=processed) case _ShardFinished(): yield message @@ -1156,7 +1201,9 @@ def generator(): generator_fns = [_make_generator_fn(group) for group in groups] readers = [ - RayPrefetchQueue(fn, 128, producer_options=dict(name=name, scheduling_strategy="SPREAD")) + RayPrefetchQueue( + fn, options.prefetch_per_group, producer_options=dict(num_cpus=0, name=name, scheduling_strategy="SPREAD") + ) for name, fn in zip(group_names, generator_fns) ] @@ -1169,6 +1216,20 @@ def generator(): yield from _interleave_shards(readers, first_group_to_start) +def _mk_processor_pool(split, processor, min_size, max_size): + import hashlib + + metadata_hash = hashlib.md5(str(processor.metadata).encode()).hexdigest() + processor_pool_name = f"processor_pool::{metadata_hash}" + processor_pool = BatchProcessorPool.options( # type: ignore + name=processor_pool_name, get_if_exists=True, lifetime="detached" + ).remote( # type: ignore + processor, min_size, max_size + ) + + return processor_pool + + def _check_current_shard_progress(statuses): unfinished_shards: list[_ShardStatus] = [] shards_with_progress: dict[str, int] = {} @@ -1237,32 +1298,6 @@ def _shard_reader_generator( yield _ShardFinished(status.shard_name, row_idx) -def _mk_process_task(processor: BatchProcessor[T, U]) -> RemoteFunction: - """ - Returns a Ray remote function that processes a batch of data. 
Basically it takes the resources from - the processor and wraps its call - """ - # processor_ref = ray.put(processor) - # exemplar = { - # "input_ids": np.random.randint(0, 100, size=(4096,)) - # } - - @ray.remote(num_cpus=processor.num_cpus, num_gpus=processor.num_gpus, resources=processor.resources) - def process_task(processor, batch_payload): - try: - result = processor(batch_payload) # TIME: 0.03 seconds - result = _canonicalize_batch(result) # type: ignore - logger.debug("Finished processing batch") - return result - except Exception as e: - logger.exception("Error while processing batch") - raise e - finally: - pass - - return process_task - - def _canonicalize_batch(batch: Union[dict, List[dict]]) -> List[dict]: if isinstance(batch, pa.RecordBatch): batch = dict_from_record_batch(batch) diff --git a/src/levanter/store/jagged_array.py b/src/levanter/store/jagged_array.py index b236641c9..d6674774f 100644 --- a/src/levanter/store/jagged_array.py +++ b/src/levanter/store/jagged_array.py @@ -4,7 +4,6 @@ from typing import Optional, Sequence import fsspec.core -import jax import jax.experimental.array_serialization.serialization as ser import jax.numpy as jnp import numpy as np @@ -20,6 +19,60 @@ DEFAULT_WRITE_CHUNK_SIZE = DEFAULT_CHUNK_SIZE * 512 +@dataclass +class PreparedBatch: + """ + A batch of data that has been prepared for storage in a jagged array. + """ + + data: np.ndarray + offsets: np.ndarray + shapes: Optional[np.ndarray] + + def astype(self, dtype): + return PreparedBatch(self.data.astype(dtype), self.offsets, self.shapes) + + @property + def num_rows(self): + return len(self.offsets) + + @staticmethod + def from_batch(items: Sequence[np.ndarray], item_rank: Optional[int] = None) -> "PreparedBatch": + data, offsets, shapes = _prepare_batch(items, item_rank) + return PreparedBatch(data, offsets, shapes) + + @staticmethod + def concat(batches: Sequence["PreparedBatch"]) -> "PreparedBatch": + data = np.concatenate([batch.data for batch in batches]) + shapes = np.concatenate([batch.shapes for batch in batches]) if batches[0].shapes is not None else None + # offsets have to be adjusted by adding the previous offset + totals = np.cumsum([0] + [batch.data.size for batch in batches]) + offsets = np.concatenate([batch.offsets + total for batch, total in zip(batches, totals)]) + + return PreparedBatch(data, offsets, shapes) + + +def _prepare_batch(arrays, item_rank): + if item_rank is None: + item_rank = arrays[0].ndim + + if item_rank != 1: + shapes = np.array([data.shape[:-1] for data in arrays], dtype=np.int64) + else: + + shapes = None + + # check shapes + for data in arrays: + if data.ndim != item_rank: + raise ValueError(f"Expected data to have rank {item_rank}, but got {data.ndim}") + + offsets = np.array([data.size for data in arrays], dtype=np.int64) + offsets = np.cumsum(offsets) + data = np.concatenate([data.reshape(-1) for data in arrays]) + return data, offsets, shapes + + @dataclass class JaggedArrayStore: """ @@ -113,10 +166,10 @@ async def data_size_async(self): self._cached_data_size = result return result - async def append_async(self, data: jax.Array): + async def append_async(self, data: np.ndarray): await self.extend_async([data]) - def append(self, data: jax.Array): + def append(self, data: np.ndarray): self.extend([data]) async def trim_to_size_async(self, size: int): @@ -190,13 +243,21 @@ def trim_to_size(self, size: int): self._cached_num_rows = size self._cached_data_size = new_max - async def extend_async(self, arrays: Sequence[jax.Array]): - data, 
new_offsets, shapes = self._prepare_batch(arrays) + async def extend_async(self, arrays: Sequence[np.ndarray] | PreparedBatch): + if isinstance(arrays, PreparedBatch): + prepared = arrays + else: + prepared = PreparedBatch.from_batch(arrays, self.item_rank) + data = prepared.data + new_offsets = prepared.offsets + shapes = prepared.shapes num_rows = await self.num_rows_async() - num_added = len(arrays) + num_added = len(new_offsets) current_data_size = self.data_size + new_offsets = new_offsets + current_data_size + # Write to resized arrays concurrently, adjusting offsets explicitly write_tasks = [ self.data[current_data_size : current_data_size + len(data)].write(data), @@ -207,19 +268,33 @@ async def extend_async(self, arrays: Sequence[jax.Array]): await asyncio.gather(*write_tasks) # Update num_rows - await self.offsets[0].write(num_rows + len(arrays)) + await self.offsets[0].write(num_rows + num_added) if self._cache_metadata: - self._cached_num_rows = num_rows + len(arrays) + self._cached_num_rows = num_rows + num_added self._cached_data_size = current_data_size + len(data) - def extend(self, arrays: Sequence[jax.Array]): - data, new_offsets, shapes = self._prepare_batch(arrays) + def extend(self, arrays: Sequence[np.ndarray] | PreparedBatch): + if isinstance(arrays, PreparedBatch): + prepared = arrays + else: + prepared = PreparedBatch.from_batch(arrays, self.item_rank) + + data = prepared.data + new_offsets = prepared.offsets + shapes = prepared.shapes + + if shapes is None and self.item_rank != 1: + raise ValueError("Shapes must be provided for non-vector data") + elif shapes is not None and shapes.shape[1] != self.item_rank - 1: + raise ValueError(f"Shapes must have {self.item_rank-1} dimensions, but got {shapes.shape[1]}") num_rows = self.num_rows - num_added = len(arrays) + num_added = len(new_offsets) current_data_size = self.data_size + new_offsets = new_offsets + current_data_size + write_tasks = [ self.data[current_data_size : current_data_size + len(data)].write(data), self.offsets[num_rows + 1 : num_rows + num_added + 1].write(new_offsets), @@ -231,28 +306,12 @@ def extend(self, arrays: Sequence[jax.Array]): for task in write_tasks: task.result() - self.offsets[0].write(num_rows + len(arrays)).result() + self.offsets[0].write(num_rows + num_added).result() if self._cache_metadata: - self._cached_num_rows = num_rows + len(arrays) + self._cached_num_rows = num_rows + num_added self._cached_data_size = current_data_size + len(data) - def _prepare_batch(self, arrays): - if self.shapes is not None: - for data in arrays: - if data.ndim != self.item_rank: - raise ValueError(f"Expected data to have rank {self.item_rank}, got {data.ndim}") - shapes = np.array([data.shape[:-1] for data in arrays], dtype=np.int64) - else: - for data in arrays: - if data.ndim > 1: - raise ValueError(f"Expected data to have rank 1, got {data.ndim}") - shapes = None - new_offsets = np.array([data.size for data in arrays], dtype=np.int64) - new_offsets = np.cumsum(new_offsets) + self.data_size - data = np.concatenate([data.reshape(-1) for data in arrays]) - return data, new_offsets, shapes - async def reload_async(self) -> "JaggedArrayStore": """ Calls `resolve` on the underlying tensorstore objects, updating size information @@ -309,7 +368,7 @@ async def get_item_async(self, item): else: raise e - async def get_batch(self, indices: Sequence[int]) -> Sequence[jax.Array]: + async def get_batch(self, indices: Sequence[int]) -> Sequence[np.ndarray]: # get indices with ts.Batch(): all_indices_futs = 
[self._bounds_for_rows_async(indices[i], indices[i] + 1) for i in range(len(indices))] @@ -334,7 +393,7 @@ async def get_batch(self, indices: Sequence[int]) -> Sequence[jax.Array]: return data - def get_batch_sync(self, indices: Sequence[int]) -> Sequence[jax.Array]: + def get_batch_sync(self, indices: Sequence[int]) -> Sequence[np.ndarray]: all_indices = self._bounds_for_rows_batch(indices) with ts.Batch(): diff --git a/src/levanter/store/tree_store.py b/src/levanter/store/tree_store.py index cd29e5a4c..03355a8d2 100644 --- a/src/levanter/store/tree_store.py +++ b/src/levanter/store/tree_store.py @@ -10,7 +10,7 @@ from haliax.jax_utils import is_jax_array_like -from .jagged_array import JaggedArrayStore +from .jagged_array import JaggedArrayStore, PreparedBatch T = TypeVar("T", bound=PyTree) @@ -49,6 +49,10 @@ def __init__(self, tree, path: str, mode: str): self.mode = mode self.tree = tree + @property + def batch_preparer(self): + return TreeBatchPreparer(jtu.tree_map(lambda writer: 9, self.tree, is_leaf=heuristic_is_leaf)) + @staticmethod def open(exemplar: T, path: str, *, mode="a", cache_metadata: bool = False) -> "TreeStore": """ @@ -64,7 +68,6 @@ def extend(self, batch: Sequence[T]): """ Append a batch of data to the store. """ - # TODO: I do wish zarr supported async jtu.tree_map( lambda writer, *xs: writer.extend([np.asarray(x) for x in xs]), self.tree, @@ -80,7 +83,7 @@ def extend_with_batch(self, batch: T): For instance, HF's BatchEncoding is a dict of lists of numpy arrays. """ jtu.tree_map( - lambda writer, xs: writer.extend([np.asarray(x) for x in xs]), + lambda writer, xs: writer.extend(xs if isinstance(xs, PreparedBatch) else [np.asarray(x) for x in xs]), self.tree, batch, is_leaf=heuristic_is_leaf_batched, @@ -94,7 +97,9 @@ async def extend_with_batch_async(self, batch: T): For instance, HF's BatchEncoding is a dict of lists of numpy arrays. """ futures = jtu.tree_map( - lambda writer, xs: writer.extend_async([np.asarray(x) for x in xs]), + lambda writer, xs: writer.extend_async( + xs if isinstance(xs, PreparedBatch) else [np.asarray(x) for x in xs] + ), self.tree, batch, is_leaf=heuristic_is_leaf_batched, @@ -198,46 +203,14 @@ def _render_path_elem(x): return str(x) -# class TokenSeqDataset: -# """ -# A dataset of sequences of tokens of fixed length, materialized from a collection of JaggedArrayStores, -# which have typically much longer sequences. This class takes consecutive sequences of tokens from the builders -# and slices/concats them to form the dataset. -# """ -# -# def __init__( -# self, token_arrays: Sequence[JaggedArrayStore], token_counts: Sequence[int], seq_len: int, pad_token: int -# ): -# self.token_arrays = token_arrays -# -# def _round_to_nearest_multiple(x, y): -# return x + y - x % y -# -# token_counts_padded = np.array([_round_to_nearest_multiple(x, seq_len) for x in token_counts]) -# seq_counts = token_counts_padded // seq_len -# self.seq_counts_cumsum = np.concatenate([np.asarray([0]), np.cumsum(seq_counts)]) -# -# self.seq_len = seq_len -# self.pad_token = pad_token -# -# def __len__(self): -# return self.seq_counts_cumsum[-1] -# -# def __getitem__(self, seq_id): -# return asyncio.run(self.get_item_async(seq_id)) -# -# async def get_item_async(self, seq_id): -# # TODO: accept slices and such? 
-# shard_id = np.searchsorted(self.seq_counts_cumsum, seq_id, side="right") - 1 -# shard_start = self.seq_counts_cumsum[shard_id] -# shard_end = self.seq_counts_cumsum[shard_id + 1] -# shard_seq_id = seq_id - shard_start -# -# shard_seq_start = shard_seq_id * self.seq_len -# shard_seq_end = min((shard_seq_id + 1) * self.seq_len, self.token_arrays[shard_id].data_size) -# -# shard_seq = await self.token_arrays[shard_id].data[shard_seq_start:shard_seq_end].read() -# pad_len = self.seq_len - (shard_seq_end - shard_seq_start) -# padded_seq = np.concatenate([shard_seq, np.full(pad_len, self.pad_token, dtype=shard_seq.dtype)]) -# -# return padded_seq +class TreeBatchPreparer(Generic[T]): + def __init__(self, exemplar: T): + self.exemplar = exemplar + + def __call__(self, batch: List[T]) -> PyTree: + return jtu.tree_map( + lambda _, *xs: PreparedBatch.from_batch([np.asarray(x) for x in xs]), + self.exemplar, + *batch, + is_leaf=heuristic_is_leaf, + ) diff --git a/src/levanter/utils/actor_pool.py b/src/levanter/utils/actor_pool.py new file mode 100644 index 000000000..51ba2ccec --- /dev/null +++ b/src/levanter/utils/actor_pool.py @@ -0,0 +1,224 @@ +import asyncio +import logging +from abc import ABC +from typing import Any, Callable, Dict, List, Optional, TypeVar + +import ray + + +V = TypeVar("V") +R = TypeVar("R") + +logger = logging.getLogger(__name__) + +# Copilot-Adapted from: +# https://github.com/ray-project/ray/blob/1bab09bf842edee51c3778be4cfb16f8b900d764/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py + + +class AutoScalingActorPool: + """Utility class to operate on a dynamically scaling pool of actors.""" + + def __init__( + self, + create_actor_fn: Callable[[], "ray.actor.ActorHandle"], + min_size: int = 1, + max_size: int = 10, + ): + if max_size < min_size: + raise ValueError("max_size must be greater than or equal to min_size.") + self._create_actor_fn = create_actor_fn + self._min_size = min_size + self._max_size = max_size + + self._idle_actors: List[ray.actor.ActorHandle] = [] + self._busy_actors: Dict[ray.ObjectRef, ray.actor.ActorHandle] = {} + self._pending_actors: Dict[ray.ObjectRef, ray.actor.ActorHandle] = {} + + self._actor_locations: Dict[ray.actor.ActorHandle, str] = {} + self._tasks_waiting_for_actor: list[asyncio.Future] = [] + self._next_task_id = 0 + + self._scale_up(self._min_size) + + @property + def num_pending_tasks(self): + return len(self._tasks_waiting_for_actor) + + def _scale_up(self, num_actors: int): + for _ in range(num_actors): + try: + actor = self._create_actor_fn() + ready_ref = actor.get_location.remote() + self._pending_actors[ready_ref] = actor + + async def wait_for_ready(actor, ready_ref): + loc = await ready_ref + # pending -> floating + if ready_ref not in self._pending_actors: + logger.info("Actor was cancelled before it was ready.") + return + del self._pending_actors[ready_ref] + self._assert_is_floating(actor) + self._actor_locations[actor] = loc + self._maybe_start_pending_task(actor) # floating -> {idle, busy} + + asyncio.ensure_future(wait_for_ready(actor, ready_ref)) + + except Exception as e: + logger.error("Failed to create actor.", exc_info=e) + + def _scale_down(self, num_actors: int): + for _ in range(num_actors): + if self._pending_actors: + actor = self._pending_actors.popitem()[1] + # let it die through gc + # ray.kill(actor) + elif self._idle_actors: + actor = self._idle_actors.pop() + del self._actor_locations[actor] + # let it die through gc + # ray.kill(actor) + else: + break + + def 
_adjust_pool_size(self): + num_pending_tasks = self.num_pending_tasks + num_idle_actors = len(self._idle_actors) + num_busy_actors = len(self._busy_actors) + num_pending_actors = len(self._pending_actors) + + num_nonworking_actors = num_idle_actors + num_pending_actors + total_actors = num_nonworking_actors + num_busy_actors + + # TODO: better autoscale logic + if ( + num_pending_actors == 0 + and num_pending_tasks > 0 + and num_idle_actors == 0 + and total_actors < self._max_size + ): + logger.info( + f"Scaling up due to {num_pending_tasks} pending tasks. Current pool size: {total_actors}. Max size:" + f" {self._max_size}" + ) + self._scale_up(min(self._max_size - num_busy_actors, num_pending_tasks)) + elif num_pending_tasks == 0 and num_nonworking_actors > self._min_size: + return # never scal edown. too many issues + logger.info(f"Scaling down due to no pending tasks. Current pool size: {total_actors}") + self._scale_down(num_nonworking_actors - self._min_size) + + def _get_object_location(self, obj_ref: ray.ObjectRef) -> Optional[str]: + """Get the location of the given object reference.""" + try: + locs = ray.experimental.get_object_locations([obj_ref]) + nodes = locs[obj_ref]["node_ids"] + if nodes: + return nodes[0] + except Exception as e: + logger.error(f"Failed to get object location: {e}") + return None + + def _pick_actor(self, obj_ref: Optional[ray.ObjectRef] = None) -> Optional[ray.actor.ActorHandle]: + """Pick an actor based on locality and busyness.""" + # idle -> floating + if not self._idle_actors: + return None + + if obj_ref: + preferred_loc = self._get_object_location(obj_ref) + else: + preferred_loc = None + + def penalty_key(actor): + """Returns the key that should be minimized for the best actor.""" + requires_remote_fetch = self._actor_locations[actor] != preferred_loc + return requires_remote_fetch + + actor = min(self._idle_actors, key=penalty_key) + actor = self._idle_actors.pop(self._idle_actors.index(actor)) + return actor + + def submit(self, fn: Callable[["ray.actor.ActorHandle", V], R], value: V, obj_ref: Optional[ray.ObjectRef] = None): + actor = self._pick_actor(obj_ref) + if actor: + return self._assign_task_to_actor(actor, fn, value) + else: + actor_future: asyncio.Future = asyncio.Future() + self._tasks_waiting_for_actor.append(actor_future) + f = asyncio.ensure_future(self._enqueue_pending_task(fn, obj_ref, value, actor_future)) + self._adjust_pool_size() + return f + + def _assign_task_to_actor(self, actor, fn, value): + # floating -> busy + ray_future = fn(actor, value) + self._busy_actors[ray_future] = actor + self._adjust_pool_size() + + # return ray_future + return asyncio.ensure_future(self._wrap_ray_future(ray_future)) + + async def _enqueue_pending_task(self, fn, obj_ref, value, actor_future): + actor = await actor_future + return await self._assign_task_to_actor(actor, fn, value) + + def _assert_is_floating(self, actor): + assert actor not in self._idle_actors + assert actor not in self._busy_actors + assert actor not in self._pending_actors + + def _maybe_start_pending_task(self, actor): + self._assert_is_floating(actor) + if self._tasks_waiting_for_actor: + # floating -> busy (inside the _enqueue_pending_task coroutine) + actor_future = self._tasks_waiting_for_actor.pop(0) + actor_future.set_result(actor) + assigned = True + else: + # floating -> idle + self._idle_actors.append(actor) + self._adjust_pool_size() + assigned = False + return assigned + + async def _wrap_ray_future(self, ray_future): + await asyncio.wait([ray_future]) + 
self._on_task_done(ray_future) + return await ray_future + + def _on_task_done(self, ray_future): + actor = self._busy_actors.pop(ray_future) + self._maybe_start_pending_task(actor) + + async def map( + self, + fn: Callable[["ray.actor.ActorHandle", V], Any], + values: List[V], + obj_refs: Optional[List[Optional[ray.ObjectRef]]] = None, + ) -> List[Any]: + if obj_refs is None: + obj_refs = [None] * len(values) + + tasks = [self.submit(fn, v, obj_ref) for v, obj_ref in zip(values, obj_refs)] + return await asyncio.gather(*tasks) + + def has_free(self): + return bool(self._idle_actors) + + def has_free_or_pending_actors(self): + return bool(self._idle_actors) or bool(self._pending_actors) + + def pop_idle(self): + if self._idle_actors: + return self._idle_actors.pop() + return None + + def push(self, actor: "ray.actor.ActorHandle"): + location = ray.get(actor.get_location.remote()) + self._actor_locations[actor] = location + self._maybe_start_pending_task(actor) + + +class PoolWorkerBase(ABC): + def get_location(self) -> str: + return ray.get_runtime_context().get_node_id() diff --git a/src/levanter/utils/thread_utils.py b/src/levanter/utils/thread_utils.py index fad60ad31..401ac94c5 100644 --- a/src/levanter/utils/thread_utils.py +++ b/src/levanter/utils/thread_utils.py @@ -72,3 +72,27 @@ def close(self): self.loop.call_soon_threadsafe(self.loop.stop) self.thread.join() self.loop.close() + + +class ExceptionTrackingThread(threading.Thread): + """A thread that will store exceptions that occur in the target function and + re-raise them in the main thread.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._exception = None + + def run(self): + try: + super().run() + except Exception as e: + self._exception = e + + def join(self, *args, **kwargs): + super().join(*args, **kwargs) + if self._exception: + raise self._exception + + def check_raise(self): + if self._exception: + raise self._exception diff --git a/tests/test_actor_pool.py b/tests/test_actor_pool.py new file mode 100644 index 000000000..08686eb30 --- /dev/null +++ b/tests/test_actor_pool.py @@ -0,0 +1,167 @@ +import asyncio +import time + +import pytest +import ray + +from levanter.utils.actor_pool import AutoScalingActorPool, PoolWorkerBase +from levanter.utils.py_utils import logical_cpu_core_count + + +@ray.remote +class TestActor(PoolWorkerBase): + def __init__(self): + self.node_id = ray.get_runtime_context().get_node_id() + + def get_node_id(self): + return self.node_id + + def double(self, v): + return 2 * v + + +@ray.remote +class BlockerActor(PoolWorkerBase): + def __init__(self): + self.node_id = ray.get_runtime_context().get_node_id() + self.unblocked = False + self.unblock_event = asyncio.Event() + + def get_node_id(self): + return self.node_id + + async def block(self): + if not self.unblocked: + await self.unblock_event.wait() + + async def unblock(self): + self.unblocked = True + self.unblock_event.set() + + +@ray.remote +class BlockingTestActor(PoolWorkerBase): + def __init__(self, blocker): + self.node_id = ray.get_runtime_context().get_node_id() + self.blocker = blocker + + def get_node_id(self): + return self.node_id + + def double(self, v, bypass_blocker=False): + if not bypass_blocker: + ray.get(self.blocker.block.remote()) + return 2 * v + + +# Helper function to create a TestActor +def create_test_actor(): + return TestActor.remote() + + +def create_test_actor_blocker(blocker_handle): + return BlockingTestActor.remote(blocker_handle) + + +def setup_module(module): + ray.init( 
+ "local", num_cpus=max(2 * logical_cpu_core_count(), 8), ignore_reinit_error=True + ) # 2x cpu count is faster on my m1 + + +def teardown_module(module): + ray.shutdown() + + +@pytest.mark.asyncio +async def test_basic_submit(): + pool = AutoScalingActorPool(create_test_actor, min_size=1, max_size=4) + results = [pool.submit(lambda a, v: a.double.remote(v), i) for i in range(4)] + results = [await r for r in results] + + assert results == [0, 2, 4, 6] + + +@pytest.mark.asyncio +async def test_basic_submit_no_idle(): + pool = AutoScalingActorPool(create_test_actor, min_size=0, max_size=4) + results = [pool.submit(lambda a, v: a.double.remote(v), i) for i in range(4)] + results = [await r for r in results] + + assert results == [0, 2, 4, 6] + + +@pytest.mark.asyncio +async def test_basic_functionality(): + pool = AutoScalingActorPool(create_test_actor, min_size=1, max_size=4) + results = list(await pool.map(lambda a, v: a.double.remote(v), [1, 2, 3, 4])) + assert results == [2, 4, 6, 8] + + +@pytest.mark.asyncio +async def test_scaling_up(): + blocker = BlockerActor.remote() + pool = AutoScalingActorPool(lambda: create_test_actor_blocker(blocker), min_size=1, max_size=4) + f1 = pool.submit(lambda a, v: a.double.remote(v), 1) + f2 = pool.submit(lambda a, v: a.double.remote(v), 2) + f3 = pool.submit(lambda a, v: a.double.remote(v, True), 3) + f4 = pool.submit(lambda a, v: a.double.remote(v, True), 4) + + shield_f2 = asyncio.shield(f2) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(shield_f2, timeout=0.1) + + assert (await asyncio.gather(f3, f4)) == [6, 8] + + await blocker.unblock.remote() + # assert (await asyncio.gather(f1, f2)) == [2, 4] + assert (await f1) == 2 + assert (await f2) == 4 + + +@pytest.mark.asyncio +async def test_scaling_down(): + pool = AutoScalingActorPool(create_test_actor, min_size=1, max_size=4) + await pool.submit(lambda a, v: a.double.remote(v), 1) + await pool.submit(lambda a, v: a.double.remote(v), 2) + await pool.submit(lambda a, v: a.double.remote(v), 3) + await pool.submit(lambda a, v: a.double.remote(v), 4) + results = await asyncio.gather( + pool.submit(lambda a, v: a.double.remote(v), 1), + pool.submit(lambda a, v: a.double.remote(v), 2), + pool.submit(lambda a, v: a.double.remote(v), 3), + pool.submit(lambda a, v: a.double.remote(v), 4), + ) + assert results == [2, 4, 6, 8] + assert len(pool._idle_actors) == 1 + assert len(pool._busy_actors) == 0 + + +@pytest.mark.asyncio +async def test_push_pop_idle(): + pool = AutoScalingActorPool(create_test_actor, min_size=1, max_size=4) + await pool.submit(lambda a, v: a.double.remote(v), 1) + actor = pool.pop_idle() + assert actor is not None + pool.push(actor) + assert len(pool._idle_actors) == 1 + + +@pytest.mark.asyncio +async def test_submit_with_no_idle_actors(): + blocker = BlockerActor.remote() + pool = AutoScalingActorPool(lambda: create_test_actor_blocker(blocker), min_size=1, max_size=4) + futs = [pool.submit(lambda a, v: a.double.remote(v), i) for i in range(4)] + f5 = pool.submit(lambda a, v: a.double.remote(v), 5) + await _sleep_until(lambda: pool.num_pending_tasks == 1, timeout=10) + await blocker.unblock.remote() + await asyncio.gather(*futs) + assert (await f5) == 10 + + +async def _sleep_until(condition, timeout=5, message="Condition not met within timeout"): + start = time.time() + while not condition(): + if time.time() - start > timeout: + pytest.fail(message) + await asyncio.sleep(0.1) diff --git a/tests/test_jagged_array.py b/tests/test_jagged_array.py index 
c89a2c625..4a450bae7 100644 --- a/tests/test_jagged_array.py +++ b/tests/test_jagged_array.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from levanter.store.jagged_array import JaggedArrayStore +from levanter.store.jagged_array import JaggedArrayStore, PreparedBatch class TestJaggedArrayStore: @@ -50,6 +50,75 @@ def test_extend_with_multiple(self, cache_metadata): result2 = builder[1] assert jnp.all(result2 == data2) + @pytest.mark.parametrize("cache_metadata", [True, False]) + def test_extend_with_prepared_batch(self, cache_metadata): + with tempfile.TemporaryDirectory() as tmpdir: + builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32, cache_metadata=cache_metadata) + + data1 = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=jnp.float32) + data2 = np.array([[5.0]], dtype=jnp.float32) + prepared = PreparedBatch.from_batch([data1, data2]) + + builder.extend(prepared) + + assert len(builder) == 2 + + result1 = builder[0] + assert jnp.all(result1 == data1) + + result2 = builder[1] + assert jnp.all(result2 == data2) + + # extendd with more data + data3 = jnp.array([[6.0, 7.0], [8.0, 9.0]]) + data4 = jnp.array([[10.0]]) + prepared2 = PreparedBatch.from_batch([data3, data4]) + + builder.extend(prepared2) + + assert len(builder) == 4 + + result3 = builder[2] + assert jnp.all(result3 == data3) + + result4 = builder[3] + assert jnp.all(result4 == data4) + + @pytest.mark.asyncio + @pytest.mark.parametrize("cache_metadata", [True, False]) + async def test_extend_with_prepared_batch_async(self, cache_metadata): + with tempfile.TemporaryDirectory() as tmpdir: + builder = JaggedArrayStore.open(tmpdir, item_rank=2, dtype=jnp.float32, cache_metadata=cache_metadata) + + data1 = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=jnp.float32) + data2 = np.array([[5.0]], dtype=jnp.float32) + prepared = PreparedBatch.from_batch([data1, data2]) + + await builder.extend_async(prepared) + + assert len(builder) == 2 + + result1 = builder[0] + assert jnp.all(result1 == data1) + + result2 = builder[1] + assert jnp.all(result2 == data2) + + # extendd with more data + data3 = jnp.array([[6.0, 7.0], [8.0, 9.0]]) + data4 = jnp.array([[10.0]]) + prepared2 = PreparedBatch.from_batch([data3, data4]) + + await builder.extend_async(prepared2) + + assert len(builder) == 4 + + result3 = builder[2] + assert jnp.all(result3 == data3) + + result4 = builder[3] + assert jnp.all(result4 == data4) + def test_append_error(self): with tempfile.TemporaryDirectory() as tmpdir: builder = JaggedArrayStore.open(tmpdir, item_rank=1, dtype=jnp.float32) diff --git a/tests/test_new_cache.py b/tests/test_new_cache.py index af6fa885f..275c6a236 100644 --- a/tests/test_new_cache.py +++ b/tests/test_new_cache.py @@ -1,6 +1,5 @@ import asyncio import copy -import logging import os import tempfile from typing import Any, Dict, Iterator, Sequence @@ -23,7 +22,6 @@ build_or_load_cache, ) from levanter.utils.py_utils import logical_cpu_core_count -from levanter.utils.ray_utils import ExceptionInfo, SnitchRecipient class TestProcessor(BatchProcessor[Sequence[int], dict[str, np.ndarray]]): @@ -135,62 +133,6 @@ def test_serial_cache_writer(): np.testing.assert_array_equal(x["data"], np.asarray([i % 10 + i // 10 * 10] * 10)) -def crappy_du(path): - import os - - total = 0 - for root, dirs, files in os.walk(path): - for f in files: - total += os.path.getsize(os.path.join(root, f)) - return total - - -@ray.remote -class PretendParent(SnitchRecipient): - def __init__(self): - self.logger = logging.getLogger("SnitchRecipient") - self.failure_received = 
asyncio.Event() - self.exception_info = None - self._finished_shards = set() - self._finished = False - self._ledger = None - self._desired_next_item = None - - def _child_failed(self, child: ray.actor.ActorHandle, exception: ExceptionInfo): - try: - self.logger.error(f"Child {child} failed with exception {exception}") - self.exception_info = exception - self.failure_received.set() - except Exception as e: - self.logger.error(f"Error in _child_failed: {e}") - - def shard_failed(self, shard_name, exc_info): - self.exception_info = exc_info - self.failure_received.set() - - async def wait_for_failure(self): - await self.failure_received.wait() - return self.exception_info - - def shard_finished(self, shard_name): - self._finished_shards.add(shard_name) - - def get_finished_shards(self): - return self._finished_shards - - def _notify_updated_ledger(self, ledger): - if ledger.is_finished: - self._finished = True - - self._ledger = ledger - - def _finalize(self): - self._finished = True - - def is_finished(self): - return self._finished - - @pytest.mark.ray def test_full_end_to_end_cache(): td = tempfile.TemporaryDirectory() diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py index 66131ca48..a0d089576 100644 --- a/tests/test_tree_store.py +++ b/tests/test_tree_store.py @@ -254,6 +254,34 @@ def test_reading_from_written(): pytest.fail("Unexpected index") +def test_using_prepared_batches(): + with tempfile.TemporaryDirectory() as tmpdir: + exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} + builder = TreeStore.open(exemplar, tmpdir, mode="w") + preparer = builder.batch_preparer + + batch = [ + {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}, + {"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])}, + ] + batch = preparer(batch) + builder.extend_with_batch(batch) + + del builder + + builder2 = TreeStore.open(exemplar, tmpdir, mode="r") + + for i, result in enumerate(builder2): + if i == 0: + assert np.all(result["a"] == np.array([1.0, 2.0])) + assert np.all(result["b"] == np.array([3.0, 4.0])) + elif i == 1: + assert np.all(result["a"] == np.array([5.0, 6.0])) + assert np.all(result["b"] == np.array([7.0, 8.0])) + else: + pytest.fail("Unexpected index") + + def test_resolve_changed_cache_size(): with tempfile.TemporaryDirectory() as tmpdir: exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)} From 36459dabe1a7c9a7ce962d6c63e7de6f07daae59 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 10 Oct 2024 09:05:49 -0700 Subject: [PATCH 83/94] pre-commit --- src/levanter/data/text.py | 4 +--- src/levanter/utils/fsspec_utils.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index f2a3b8497..a1e20384f 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -10,10 +10,8 @@ from itertools import chain from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, TypeVar, Union - import datasets import equinox as eqx -import fsspec import jax import numpy as np import regex @@ -37,8 +35,8 @@ from levanter.store.cache import CacheOptions, TreeCache from levanter.store.jagged_array import JaggedArrayStore from levanter.store.tree_store import TreeStore -from levanter.utils.hf_utils import num_cpus_used_by_tokenizer from levanter.utils.fsspec_utils import fsspec_expand_glob +from levanter.utils.hf_utils import num_cpus_used_by_tokenizer silence_transformer_nag() # noqa diff --git 
a/src/levanter/utils/fsspec_utils.py b/src/levanter/utils/fsspec_utils.py index 452ab3d84..64870443d 100644 --- a/src/levanter/utils/fsspec_utils.py +++ b/src/levanter/utils/fsspec_utils.py @@ -1,5 +1,5 @@ -import fsspec import braceexpand +import fsspec def exists(url, **kwargs) -> bool: From 64996569ad726f5d8b4314586f7913cccfd9db92 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 10 Oct 2024 09:07:59 -0700 Subject: [PATCH 84/94] flaky hf --- tests/test_audio.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/test_audio.py b/tests/test_audio.py index 3ad9b09b3..33dda6034 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -12,12 +12,17 @@ @skip_if_no_soundlibs @skip_if_hf_model_not_accessible("openai/whisper-tiny") def test_whisper_batch_processor(): - processor = AutoProcessor.from_pretrained("openai/whisper-tiny") - tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny") - ds = load_dataset("WillHeld/test_librispeech_parquet", split="validation").select_columns(["audio", "text"]) - batch_processor = BatchAudioProcessor(processor, tokenizer) - inputs = [(audio["array"], audio["sampling_rate"], text) for audio, text in zip(ds[:16]["audio"], ds[:16]["text"])] - batch_processor(inputs) + try: + processor = AutoProcessor.from_pretrained("openai/whisper-tiny") + tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny") + ds = load_dataset("WillHeld/test_librispeech_parquet", split="validation").select_columns(["audio", "text"]) + batch_processor = BatchAudioProcessor(processor, tokenizer) + inputs = [ + (audio["array"], audio["sampling_rate"], text) for audio, text in zip(ds[:16]["audio"], ds[:16]["text"]) + ] + batch_processor(inputs) + except FileNotFoundError: + pytest.skip("No whisper model found. 
Probably HF is being flaky.") @skip_if_no_soundlibs From 074477f437c42e577282331e0c65958003eb96d1 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 10 Oct 2024 18:57:31 -0500 Subject: [PATCH 85/94] Fix actor pool in python 3.11, add better scaling down logic (#760) --- config/data/openwebtext_source.yaml | 3 ++ src/levanter/store/cache.py | 3 -- src/levanter/utils/actor_pool.py | 48 ++++++++++++++++++++++------- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/config/data/openwebtext_source.yaml b/config/data/openwebtext_source.yaml index 764ee0b9e..6daa695c0 100644 --- a/config/data/openwebtext_source.yaml +++ b/config/data/openwebtext_source.yaml @@ -4,3 +4,6 @@ validation_urls: - "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_val.{1..8}-of-8.jsonl.gz" cache_dir: "gs://levanter-data/tokenized/openwebtext/" tokenizer: "gpt2" +cache_options: + batch_size: 1024 + num_shard_groups: 64 diff --git a/src/levanter/store/cache.py b/src/levanter/store/cache.py index c0bda78f9..ee1969a03 100644 --- a/src/levanter/store/cache.py +++ b/src/levanter/store/cache.py @@ -1061,11 +1061,8 @@ def _write_batches(writer: ShardedCacheWriter, shard_totals, batches, finished_s def _fetch_batches(batches) -> tuple[dict[str, int], list[PreparedBatch]]: - time_in = time.time() shards_for_batches, payloads_for_batches = zip(*batches) payloads_for_batches = ray.get(list(payloads_for_batches)) - time_out = time.time() - logger.info(f"Fetched {len(batches)} batches in {time_out - time_in} seconds") shard_row_totals: dict[str, int] = {} for shard, payload in zip(shards_for_batches, payloads_for_batches): diff --git a/src/levanter/utils/actor_pool.py b/src/levanter/utils/actor_pool.py index 51ba2ccec..76c3ca8fb 100644 --- a/src/levanter/utils/actor_pool.py +++ b/src/levanter/utils/actor_pool.py @@ -15,6 +15,11 @@ # https://github.com/ray-project/ray/blob/1bab09bf842edee51c3778be4cfb16f8b900d764/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py +def _wrap_ray_future(ray_future): + # work around https://github.com/ray-project/ray/issues/45895#issuecomment-2165164129 + return asyncio.wrap_future(ray_future.future()) + + class AutoScalingActorPool: """Utility class to operate on a dynamically scaling pool of actors.""" @@ -37,6 +42,7 @@ def __init__( self._actor_locations: Dict[ray.actor.ActorHandle, str] = {} self._tasks_waiting_for_actor: list[asyncio.Future] = [] self._next_task_id = 0 + self._scale_down_task: Optional[asyncio.Task] = None self._scale_up(self._min_size) @@ -45,6 +51,9 @@ def num_pending_tasks(self): return len(self._tasks_waiting_for_actor) def _scale_up(self, num_actors: int): + if self._scale_down_task and not self._scale_down_task.done(): + self._scale_down_task.cancel() + for _ in range(num_actors): try: actor = self._create_actor_fn() @@ -52,7 +61,7 @@ def _scale_up(self, num_actors: int): self._pending_actors[ready_ref] = actor async def wait_for_ready(actor, ready_ref): - loc = await ready_ref + loc = await _wrap_ray_future(ready_ref) # pending -> floating if ready_ref not in self._pending_actors: logger.info("Actor was cancelled before it was ready.") @@ -67,8 +76,8 @@ async def wait_for_ready(actor, ready_ref): except Exception as e: logger.error("Failed to create actor.", exc_info=e) - def _scale_down(self, num_actors: int): - for _ in range(num_actors): + def _scale_down(self, target_num_actors: int): + while len(self._idle_actors) + len(self._pending_actors) > target_num_actors: if self._pending_actors: actor = self._pending_actors.popitem()[1] # 
let it die through gc @@ -102,10 +111,20 @@ def _adjust_pool_size(self): f" {self._max_size}" ) self._scale_up(min(self._max_size - num_busy_actors, num_pending_tasks)) + + # Schedule scale down if idle elif num_pending_tasks == 0 and num_nonworking_actors > self._min_size: - return # never scal edown. too many issues - logger.info(f"Scaling down due to no pending tasks. Current pool size: {total_actors}") - self._scale_down(num_nonworking_actors - self._min_size) + if self._scale_down_task is None or self._scale_down_task.done(): + self._scale_down_task = asyncio.create_task(self._schedule_scale_down()) + + async def _schedule_scale_down(self): + try: + await asyncio.sleep(10) + if self.num_pending_tasks == 0: + logger.info("Scaling down due to no pending tasks.") + self._scale_down(self._min_size) + except asyncio.CancelledError: + logger.info("Scale down task was cancelled due to new activity.") def _get_object_location(self, obj_ref: ray.ObjectRef) -> Optional[str]: """Get the location of the given object reference.""" @@ -153,10 +172,11 @@ def _assign_task_to_actor(self, actor, fn, value): # floating -> busy ray_future = fn(actor, value) self._busy_actors[ray_future] = actor + if self._scale_down_task and not self._scale_down_task.done(): + self._scale_down_task.cancel() self._adjust_pool_size() - # return ray_future - return asyncio.ensure_future(self._wrap_ray_future(ray_future)) + return asyncio.ensure_future(self._set_up_actor_return_on_finished(ray_future)) async def _enqueue_pending_task(self, fn, obj_ref, value, actor_future): actor = await actor_future @@ -181,10 +201,11 @@ def _maybe_start_pending_task(self, actor): assigned = False return assigned - async def _wrap_ray_future(self, ray_future): - await asyncio.wait([ray_future]) + async def _set_up_actor_return_on_finished(self, ray_future): + future = _wrap_ray_future(ray_future) + await asyncio.wait([future]) self._on_task_done(ray_future) - return await ray_future + return await future def _on_task_done(self, ray_future): actor = self._busy_actors.pop(ray_future) @@ -218,6 +239,11 @@ def push(self, actor: "ray.actor.ActorHandle"): self._actor_locations[actor] = location self._maybe_start_pending_task(actor) + def __del__(self): + if self._scale_down_task and not self._scale_down_task.done(): + self._scale_down_task.cancel() + # just let ray kill the actors naturally + class PoolWorkerBase(ABC): def get_location(self) -> str: From 1c0e10ec5cb700840ef3957fc15e6594bdd657ff Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 10 Oct 2024 21:26:24 -0700 Subject: [PATCH 86/94] Fix ray docs (#761) --- docs/Getting-Started-TPU-VM.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md index d0728d1c1..20fdaa765 100644 --- a/docs/Getting-Started-TPU-VM.md +++ b/docs/Getting-Started-TPU-VM.md @@ -233,7 +233,7 @@ Then, **in a separate terminal**, you can submit a job to the cluster. To replic ```bash export RAY_ADDRESS=http://localhost:8265 # tell ray where the cluster is -python infra/launch_on_ray.py --tpu_type v4-32 --foreground --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' +python infra/launch_on_ray.py --tpu_type v4-32 --foreground -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' ``` Even without `--foreground`, the job will be restarted if it fails. 
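To poke at a submitted job afterwards, the Ray Jobs CLI is handy (a quick sketch, assuming a recent Ray with the Jobs CLI installed and the same `RAY_ADDRESS` still exported; the job id below is a placeholder printed at submission time):

```bash
# list submitted jobs and their current status
ray job list
# stream the logs of one job (replace raysubmit_xxx with the id from `ray job list`)
ray job logs raysubmit_xxx --follow
```
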
The `--tpu_type` flag is required, and should be From 51f9bf1a012a82115aa650adce83f8142c8f3cc0 Mon Sep 17 00:00:00 2001 From: David Hall Date: Thu, 10 Oct 2024 23:54:48 -0700 Subject: [PATCH 87/94] ensure everything always uses at least some CPU to avoid flooding ray head node, add code to change max size of actor pool --- src/levanter/data/_preprocessor.py | 8 +++++++- src/levanter/store/cache.py | 6 +++++- src/levanter/utils/actor_pool.py | 19 +++++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/levanter/data/_preprocessor.py b/src/levanter/data/_preprocessor.py index 573015852..77b91617f 100644 --- a/src/levanter/data/_preprocessor.py +++ b/src/levanter/data/_preprocessor.py @@ -236,7 +236,7 @@ def to_hf_batched(x): return {b.field(i).name: to_hf_batched(b.column(i).to_numpy(zero_copy_only=False)) for i in range(b.num_columns)} -@ray.remote(num_cpus=0) +@ray.remote(num_cpus=0.1) # keep this low b/c it doesn't do much class BatchProcessorPool: def __init__(self, processor: BatchProcessor, min_size: int = 1, max_size: int = 10): logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(message)s") @@ -253,6 +253,12 @@ async def process_batch(self, batch_ref: RefBox): def num_pending_tasks(self): return self.actor_pool.num_pending_tasks + def resize_pool(self, *, min_size: int | None = None, max_size: int | None = None): + self.actor_pool.resize_pool(min_size=min_size, max_size=max_size) + + def ensure_max_at_least(self, size: int): + self.actor_pool.resize_pool(max_size=max(size, self.actor_pool.get_max_size())) + def _create_batch_processor_actor(processor: BatchProcessor, processor_ref): cpus = processor.num_cpus diff --git a/src/levanter/store/cache.py b/src/levanter/store/cache.py index ee1969a03..218bce95c 100644 --- a/src/levanter/store/cache.py +++ b/src/levanter/store/cache.py @@ -1199,7 +1199,9 @@ def generator(): readers = [ RayPrefetchQueue( - fn, options.prefetch_per_group, producer_options=dict(num_cpus=0, name=name, scheduling_strategy="SPREAD") + fn, + options.prefetch_per_group, + producer_options=dict(num_cpus=0.1, name=name, scheduling_strategy="SPREAD"), ) for name, fn in zip(group_names, generator_fns) ] @@ -1224,6 +1226,8 @@ def _mk_processor_pool(split, processor, min_size, max_size): processor, min_size, max_size ) + ray.get(processor_pool.ensure_max_at_least.remote(max_size)) + return processor_pool diff --git a/src/levanter/utils/actor_pool.py b/src/levanter/utils/actor_pool.py index 76c3ca8fb..f40834bb5 100644 --- a/src/levanter/utils/actor_pool.py +++ b/src/levanter/utils/actor_pool.py @@ -50,6 +50,25 @@ def __init__( def num_pending_tasks(self): return len(self._tasks_waiting_for_actor) + def resize_pool(self, *, min_size: Optional[int] = None, max_size: Optional[int] = None): + old_min_size = self._min_size + if min_size is not None: + self._min_size = min_size + old_max_size = self._max_size + if max_size is not None: + self._max_size = max_size + + if old_min_size != self._min_size or old_max_size != self._max_size: + logger.info(f"Resizing pool to min_size: {self._min_size}, max_size: {self._max_size}") + + self._adjust_pool_size() + + def get_max_size(self): + return self._max_size + + def get_min_size(self): + return self._min_size + def _scale_up(self, num_actors: int): if self._scale_down_task and not self._scale_down_task.done(): self._scale_down_task.cancel() From c3b3dd80503b9246a880e6705c006a4b6ebcae75 Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 11 Oct 2024 15:14:51 -0700 Subject: [PATCH 
88/94] cap the size of the core writer task rather than the number of batches (#762) This is marginally slower, but pile now builds fine on a v4-32, which is an improvement. --- src/levanter/store/cache.py | 118 ++++++++++++++++++----------- src/levanter/store/jagged_array.py | 4 + src/levanter/utils/actor_pool.py | 5 +- tests/test_new_cache.py | 11 ++- 4 files changed, 90 insertions(+), 48 deletions(-) diff --git a/src/levanter/store/cache.py b/src/levanter/store/cache.py index 218bce95c..45265c994 100644 --- a/src/levanter/store/cache.py +++ b/src/levanter/store/cache.py @@ -16,6 +16,7 @@ import deepdiff import fsspec.core +import humanfriendly import jax import pyarrow as pa import ray @@ -77,12 +78,16 @@ class CacheOptions: process. Lower values will use less memory but take somewhat longer to build the cache.""" # the below options don't actually impact the cache's result, but do impact construction - num_batches_per_flush = 256 - """The number of batches to process before flushing the cache to disk. This is used to control the memory usage of - the cache building process. Lower values will use less memory but may take somewhat longer to build the cache.""" + target_size_per_flush: int | str = "512MB" + """The number of bytes to buffer before flushing to disk. This is used to control the memory usage of the cache + building process. Lower values will use less memory but could take somewhat longer to build the cache.""" prefetch_per_group: int = 4 """The number of batches to prefetch per group. This is used to keep the processors busy and to reduce the time""" + @property + def target_bytes_per_flush(self): + return humanfriendly.parse_size(self.target_size_per_flush) + @staticmethod def default(): return CacheOptions() @@ -684,6 +689,10 @@ def write_batch(self, shard_name: str, batch: BatchResult): def finish(self): # if successful, write the ledger logger.info("Finished writing cache") + # check that all shards are finished + if set(self._ledger.shard_rows.keys()) != set(self._ledger.finished_shards): + raise ValueError("Not all shards are finished") + self._ledger.is_finished = True self._ledger._serialize_and_commit(self.cache_dir) if self._on_write: @@ -755,8 +764,13 @@ def __init__( self.logger.info("Cache already finished. Nothing to do.") return self._cache_writer = _core_writer_task.options( - name=f"writer::{path_for_name}", scheduling_strategy="SPREAD" - ).remote(current_actor_handle(), cache_dir, split, self._ledger, source, processor, force_flush) + name=f"writer::{path_for_name}", + scheduling_strategy="SPREAD", + # memory needed for the writer is twice the options' target size per flush + # (we get twice from we need to concatenate prepared batches into the accumulator) + # TODO: measure. 
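+            # (e.g. with the default target_size_per_flush of "512MB", humanfriendly.parse_size
+            #  returns 512_000_000 bytes, so Ray schedules this task with a ~1 GB memory request)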
+ memory=2 * self._options.target_bytes_per_flush, + ).remote(current_actor_handle(), cache_dir, self._ledger, source, processor, force_flush) except Exception: # Ray behaves poorly if the constructor of an actor fails, so we catch and log here # this also propagates to the finished promise, so we can handle it there @@ -767,14 +781,6 @@ def current_ledger(self): raise self._finished_promise.exception() return self._ledger - def other_failed(self, error: ExceptionInfo): - """Callback method for when a shard worker has failed.""" - self._writer_exception(None, error) - - def _child_failed(self, child: ray.actor.ActorHandle, exception: ExceptionInfo): - self.logger.error(f"Child {child} failed with exception", exc_info=exception.restore()) - self._writer_exception(None, exception) - def is_finished(self): if self.failed(): return False @@ -872,13 +878,6 @@ def _get_builder_actor(split, cache_dir, shard_source, processor, options=CacheO # a stream of tokenized batches. We then interleave these tokenized batches and write them to the cache. # The reader tasks are given a group of shards, which are implicitly concatenated together. -# This is still much slower than I would like but I haven't figured out why yet. -# TODO: -# - [ ] Profile the tokenization process more (see TIME comments) -# - [ ] Try Ray's autoscaling actorpool if the issue is tokenization isn't fast enough -# https://github.com/ray-project/ray/blob/1bab09bf842edee51c3778be4cfb16f8b900d764/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py -# - [ ] More observability into what's queued and how long work items take - @dataclass class _Batch: @@ -907,14 +906,12 @@ class _ShardFinished: """ _TIME_BETWEEN_WRITES = 20.0 # seconds -_MIN_WRITE_BATCHES = 100 @ray.remote(num_cpus=1) def _core_writer_task( parent, cache_dir, - split, initial_ledger: CacheLedger, source: ShardedDataSource, processor, @@ -948,11 +945,11 @@ def on_write(ledger): options = initial_ledger.metadata.options num_groups = min(options.num_shard_groups or 1000000, len(source.shard_names)) - processor_pool = _mk_processor_pool(split, processor, 0, num_groups * 4) + processor_pool = _mk_processor_pool(processor, 0, num_groups * 4) interleave: RayPrefetchQueue = RayPrefetchQueue( lambda: _make_interleave(name, source, initial_ledger, processor_pool), - 512, + 64, producer_options={"num_cpus": 1, "name": f"{name}::interleave"}, ) @@ -962,7 +959,8 @@ def on_write(ledger): flush_time = Stopwatch() flush_amortized_time = Stopwatch() - batches: list = [] + current_prepared_batch: Optional[PyTree[PreparedBatch]] = None + current_shard_rows: dict[str, int] = {} time_of_last_write = time.time() batches_total = 0.0 flush_thread = None @@ -975,22 +973,36 @@ def on_write(ledger): time_since_last_write = cur_time - time_of_last_write remaining_time = _TIME_BETWEEN_WRITES - time_since_last_write - if len(batches) > 0: + if current_prepared_batch is not None: with flush_amortized_time: # 6e-4 - if remaining_time <= 0 or len(batches) >= options.num_batches_per_flush or force_flush: + current_byte_size = sum( + b.byte_size for b in jax.tree_util.tree_flatten(current_prepared_batch)[0] + ) + should_flush = ( + force_flush + or remaining_time <= 0 + or (current_byte_size >= options.target_bytes_per_flush) + ) + if should_flush: with flush_time: # 0.613s - shard_rows, payloads = _fetch_batches(batches) if flush_thread is not None: flush_thread.join() - batches = [] flush_thread = ExceptionTrackingThread( target=_write_batches, - args=(sharded_cache_writer, 
shard_rows, payloads, finished_shards_last_flush), + args=( + sharded_cache_writer, + current_shard_rows, + current_prepared_batch, + finished_shards_last_flush, + ), ) flush_thread.start() + current_prepared_batch = None + current_shard_rows = {} finished_shards_last_flush = [] + time_of_last_write = time.time() continue else: @@ -1005,18 +1017,30 @@ def on_write(ledger): with append_time: match message: - case _Batch(shard, _, payload): + case _Batch(shard, row_indices, payload): batches_total += 1 - batches.append((shard, payload)) + this_prepared_batch = ray.get(payload) + if current_prepared_batch is None: + # TODO: actually check row indices + current_shard_rows = {shard: len(row_indices)} + current_prepared_batch = this_prepared_batch + else: + current_shard_rows[shard] = current_shard_rows.get(shard, 0) + len(row_indices) + current_prepared_batch = _concat_prepared_batches( + current_prepared_batch, this_prepared_batch + ) + del this_prepared_batch if force_flush: - shard_rows, payloads = _fetch_batches(batches) - del batches _write_batches( - sharded_cache_writer, shard_rows, payloads, finished_shards_last_flush + sharded_cache_writer, + current_shard_rows, + current_prepared_batch, + finished_shards_last_flush, ) - batches = [] finished_shards_last_flush = [] + current_prepared_batch = None + current_shard_rows = {} case _ShardFinished(shard, total_rows): finished_shards_last_flush.append((shard, total_rows)) @@ -1038,12 +1062,12 @@ def on_write(ledger): raise e # force a flush - if len(batches) > 0: - shard_row_totals, payloads_for_batches = _fetch_batches(batches) - del batches + if current_prepared_batch is not None or finished_shards_last_flush: if flush_thread is not None: flush_thread.join() - _write_batches(sharded_cache_writer, shard_row_totals, payloads_for_batches, finished_shards_last_flush) + _write_batches( + sharded_cache_writer, current_shard_rows, current_prepared_batch, finished_shards_last_flush + ) sharded_cache_writer.finish() @@ -1051,10 +1075,16 @@ def on_write(ledger): return out -def _write_batches(writer: ShardedCacheWriter, shard_totals, batches, finished_shards): +def _concat_prepared_batches( + current_prepared_batch: PyTree[PreparedBatch], this_prepared_batch: PyTree[PreparedBatch] +): + return jax.tree.map(lambda *bs: PreparedBatch.concat(bs), current_prepared_batch, this_prepared_batch) + + +def _write_batches(writer: ShardedCacheWriter, shard_totals, batch: Optional[PyTree[PreparedBatch]], finished_shards): # concatenate the payloads - final_payload = jax.tree.map(lambda *bs: PreparedBatch.concat(bs), *batches) - writer.write_prepared_batch(shard_totals, final_payload) + if batch is not None: + writer.write_prepared_batch(shard_totals, batch) for shard, total_rows in finished_shards: writer.finish_shard(shard, total_rows) @@ -1215,7 +1245,7 @@ def generator(): yield from _interleave_shards(readers, first_group_to_start) -def _mk_processor_pool(split, processor, min_size, max_size): +def _mk_processor_pool(processor, min_size, max_size): import hashlib metadata_hash = hashlib.md5(str(processor.metadata).encode()).hexdigest() diff --git a/src/levanter/store/jagged_array.py b/src/levanter/store/jagged_array.py index d6674774f..1013d0e34 100644 --- a/src/levanter/store/jagged_array.py +++ b/src/levanter/store/jagged_array.py @@ -29,6 +29,10 @@ class PreparedBatch: offsets: np.ndarray shapes: Optional[np.ndarray] + @property + def byte_size(self): + return self.data.nbytes + self.offsets.nbytes + (self.shapes.nbytes if self.shapes is not None else 0) + 
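+    # (note: the core writer task sums byte_size across the PyTree of buffered PreparedBatches
+    #  and flushes to the cache once the total reaches CacheOptions.target_bytes_per_flush)
+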
def astype(self, dtype): return PreparedBatch(self.data.astype(dtype), self.offsets, self.shapes) diff --git a/src/levanter/utils/actor_pool.py b/src/levanter/utils/actor_pool.py index f40834bb5..a694bee20 100644 --- a/src/levanter/utils/actor_pool.py +++ b/src/levanter/utils/actor_pool.py @@ -133,7 +133,7 @@ def _adjust_pool_size(self): # Schedule scale down if idle elif num_pending_tasks == 0 and num_nonworking_actors > self._min_size: - if self._scale_down_task is None or self._scale_down_task.done(): + if self._scale_down_task is None: self._scale_down_task = asyncio.create_task(self._schedule_scale_down()) async def _schedule_scale_down(self): @@ -142,8 +142,9 @@ async def _schedule_scale_down(self): if self.num_pending_tasks == 0: logger.info("Scaling down due to no pending tasks.") self._scale_down(self._min_size) + self._scale_down_task = None except asyncio.CancelledError: - logger.info("Scale down task was cancelled due to new activity.") + logger.debug("Scale down task was cancelled due to new activity.") def _get_object_location(self, obj_ref: ray.ObjectRef) -> Optional[str]: """Get the location of the given object reference.""" diff --git a/tests/test_new_cache.py b/tests/test_new_cache.py index 275c6a236..c1eb73670 100644 --- a/tests/test_new_cache.py +++ b/tests/test_new_cache.py @@ -97,8 +97,9 @@ def metadata(self) -> Dict[str, Any]: class SimpleShardSource(ShardedDataSource[list[int]]): - def __init__(self, num_shards: int = 4): + def __init__(self, num_shards: int = 4, rows_per_shard: int = 10): self._num_shards = num_shards + self._rows_per_shard = rows_per_shard @property def shard_names(self) -> Sequence[str]: @@ -107,7 +108,7 @@ def shard_names(self) -> Sequence[str]: def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[list[int]]: # parse the shard name to get the shard number shard_num = int(shard_name.split("_")[1]) - return ([shard_num * 10 + i] * 10 for i in range(row, 10)) + return ([shard_num * 10 + i] * 10 for i in range(row, self._rows_per_shard)) def test_serial_cache_writer(): @@ -465,6 +466,9 @@ def test_sharded_cache_writer(): for ex in batched(source.open_shard(shard_name), ledger.metadata.options.batch_size): writer.write_batch(shard_name, processor(ex)) + for shard_name in source.shard_names: + writer.finish_shard(shard_name, source._rows_per_shard) + store = writer.finish() data_path = store.path @@ -501,6 +505,9 @@ def test_sharded_cache_writer_trims_on_resume(): for ex in batched(source.open_shard(shard_name), 8): writer.write_batch(shard_name, processor(ex)) + for shard_name in source.shard_names: + writer.finish_shard(shard_name, 10) + writer.finish() # now deliberately truncate the ledger a bit From 52bff4f9980dfdb7873a1bef2995fb7e74f797ee Mon Sep 17 00:00:00 2001 From: Nikil Ravi Date: Sun, 13 Oct 2024 15:01:40 -0700 Subject: [PATCH 89/94] add parquet support --- src/levanter/data/sharded_datasource.py | 31 +++++++++++++++++- tests/test_sharded_dataset.py | 43 ++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/levanter/data/sharded_datasource.py b/src/levanter/data/sharded_datasource.py index 38682616d..494bd5f05 100644 --- a/src/levanter/data/sharded_datasource.py +++ b/src/levanter/data/sharded_datasource.py @@ -20,6 +20,8 @@ import datasets import fsspec import numpy as np +import pyarrow.parquet as pq +import pandas as pd from levanter.utils import fsspec_utils @@ -149,6 +151,10 @@ def datasource_from_json(urls_or_paths: Sequence[str]) -> ShardedDataSource[dict return 
JsonDataSource(urls_or_paths) +def datasource_from_parquet(urls_or_paths: Sequence[str]) -> ShardedDataSource[dict]: + return ParquetDataSource(urls_or_paths) + + class WrappedHFDataSource(ShardedDataSource[dict]): """ This class is responsible for loading a dataset from HuggingFace Datasets and returning the shards. @@ -238,6 +244,11 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[str]: data = json.load(f) for doc in data[row:]: yield doc[self.text_key] + case ".parquet": + table = pq.read_table(f) + sliced_table = table.slice(row) + for record in sliced_table.to_pylist(): + yield record[self.text_key] # assumes text_key is in record case _: raise ValueError(f"Unknown format {format}") @@ -313,7 +324,7 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[Tuple[np.ndar def _sniff_format_for_dataset(url): - good_formats = [".jsonl", ".txt", ".json"] + good_formats = [".jsonl", ".txt", ".json", ".parquet"] format_from_url = None # try both with and without compression (could be gz, bz2, etc, so look at the "first" extension) extensions = [os.path.splitext(url)[1], os.path.splitext(os.path.splitext(url)[0])[1]] @@ -417,6 +428,24 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[dict]: return iter(data[row:]) +class ParquetDataSource(ShardedDataSource[dict]): + def __init__(self, urls): + self.urls = urls + self._shard_name_to_url_mapping = _mk_shard_name_mapping(urls) + + @property + def shard_names(self) -> Sequence[str]: + return list(self._shard_name_to_url_mapping.keys()) + + def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[dict]: + url = self._shard_name_to_url_mapping[shard_name] + with fsspec.open(url, "r", compression="infer") as f: + table = pq.read_table(f) + sliced_table = table.slice(row) # zero-copy slicing + for record in sliced_table.to_pylist(): + yield record + + def _mk_shard_name_mapping(urls): _shard_name_to_url_mapping = {} # remove common prefix diff --git a/tests/test_sharded_dataset.py b/tests/test_sharded_dataset.py index b3c8bcc8d..3cf0d78e6 100644 --- a/tests/test_sharded_dataset.py +++ b/tests/test_sharded_dataset.py @@ -1,6 +1,6 @@ import tempfile -from levanter.data.sharded_datasource import AudioTextUrlDataSource, _sniff_format_for_dataset +from levanter.data.sharded_datasource import AudioTextUrlDataSource, _sniff_format_for_dataset, ParquetDataSource from test_utils import skip_if_no_soundlibs @@ -24,6 +24,47 @@ def test_sniff_format_for_json(): assert _sniff_format_for_dataset(f.name) == ".json" +def test_sniff_format_for_parquet(): + + import pyarrow as pa + import pyarrow.parquet as pq + + with tempfile.NamedTemporaryFile(suffix=".parquet") as f: + table = pa.table({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}) + pq.write_table(table, f.name) + f.flush() + + assert _sniff_format_for_dataset(f.name) == ".parquet" + + @skip_if_no_soundlibs def test_resolve_audio_pointer(): AudioTextUrlDataSource.resolve_audio_pointer("https://ccrma.stanford.edu/~jos/mp3/trumpet.mp3", 16_000) + + +def test_basic_parquet_datasource_read_row(): + + import pyarrow as pa + import pyarrow.parquet as pq + + with tempfile.NamedTemporaryFile(suffix=".parquet") as f: + # Create a simple dataset + data = { + "column1": ["value1", "value2", "value3"], + "column2": [10, 20, 30] + } + table = pa.Table.from_pydict(data) + pq.write_table(table, f.name) + + # Instantiate the ParquetDataSource + datasource = ParquetDataSource([f.name]) + + # sanity check: Read data starting from row 1 + row_data = 
list(datasource.open_shard_at_row(shard_name=f.name.replace(".", "_"), row=1)) + + # Verify the output + assert len(row_data) == 2 # We expect 2 rows starting from index 1 + assert row_data[0]["column1"] == "value2" + assert row_data[0]["column2"] == 20 + assert row_data[1]["column1"] == "value3" + assert row_data[1]["column2"] == 30 \ No newline at end of file From af78281e9dbf47163c980d54981ed736000f7280 Mon Sep 17 00:00:00 2001 From: Nikil Ravi Date: Sun, 13 Oct 2024 15:15:06 -0700 Subject: [PATCH 90/94] lint, shard name fix --- tests/test_sharded_dataset.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_sharded_dataset.py b/tests/test_sharded_dataset.py index 3cf0d78e6..2fb9cf4d8 100644 --- a/tests/test_sharded_dataset.py +++ b/tests/test_sharded_dataset.py @@ -35,7 +35,7 @@ def test_sniff_format_for_parquet(): f.flush() assert _sniff_format_for_dataset(f.name) == ".parquet" - + @skip_if_no_soundlibs def test_resolve_audio_pointer(): @@ -56,15 +56,17 @@ def test_basic_parquet_datasource_read_row(): table = pa.Table.from_pydict(data) pq.write_table(table, f.name) - # Instantiate the ParquetDataSource datasource = ParquetDataSource([f.name]) + assert len(datasource.shard_names) == 1, "Expected only one shard" + shard_name = datasource.shard_names[0] + # sanity check: Read data starting from row 1 - row_data = list(datasource.open_shard_at_row(shard_name=f.name.replace(".", "_"), row=1)) + row_data = list(datasource.open_shard_at_row(shard_name=shard_name, row=1)) # Verify the output assert len(row_data) == 2 # We expect 2 rows starting from index 1 assert row_data[0]["column1"] == "value2" assert row_data[0]["column2"] == 20 assert row_data[1]["column1"] == "value3" - assert row_data[1]["column2"] == 30 \ No newline at end of file + assert row_data[1]["column2"] == 30 From 8d09cfd1c216a7bc6f302ae529ae8d6c4412b005 Mon Sep 17 00:00:00 2001 From: Nikil Ravi Date: Sun, 13 Oct 2024 15:38:47 -0700 Subject: [PATCH 91/94] pre-commit --- src/levanter/data/sharded_datasource.py | 5 ++--- tests/test_sharded_dataset.py | 23 +++++++++++++++-------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/levanter/data/sharded_datasource.py b/src/levanter/data/sharded_datasource.py index 494bd5f05..10eb42b1b 100644 --- a/src/levanter/data/sharded_datasource.py +++ b/src/levanter/data/sharded_datasource.py @@ -21,7 +21,6 @@ import fsspec import numpy as np import pyarrow.parquet as pq -import pandas as pd from levanter.utils import fsspec_utils @@ -248,7 +247,7 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[str]: table = pq.read_table(f) sliced_table = table.slice(row) for record in sliced_table.to_pylist(): - yield record[self.text_key] # assumes text_key is in record + yield record[self.text_key] # assumes text_key is in record case _: raise ValueError(f"Unknown format {format}") @@ -441,7 +440,7 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[dict]: url = self._shard_name_to_url_mapping[shard_name] with fsspec.open(url, "r", compression="infer") as f: table = pq.read_table(f) - sliced_table = table.slice(row) # zero-copy slicing + sliced_table = table.slice(row) # zero-copy slicing for record in sliced_table.to_pylist(): yield record diff --git a/tests/test_sharded_dataset.py b/tests/test_sharded_dataset.py index 2fb9cf4d8..b732596e5 100644 --- a/tests/test_sharded_dataset.py +++ b/tests/test_sharded_dataset.py @@ -1,6 +1,7 @@ +import os import tempfile -from levanter.data.sharded_datasource import 
AudioTextUrlDataSource, _sniff_format_for_dataset, ParquetDataSource +from levanter.data.sharded_datasource import AudioTextUrlDataSource, ParquetDataSource, _sniff_format_for_dataset from test_utils import skip_if_no_soundlibs @@ -30,7 +31,7 @@ def test_sniff_format_for_parquet(): import pyarrow.parquet as pq with tempfile.NamedTemporaryFile(suffix=".parquet") as f: - table = pa.table({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}) + table = pa.table({"col1": [1, 2, 3], "col2": ["a", "b", "c"]}) pq.write_table(table, f.name) f.flush() @@ -47,20 +48,23 @@ def test_basic_parquet_datasource_read_row(): import pyarrow as pa import pyarrow.parquet as pq - with tempfile.NamedTemporaryFile(suffix=".parquet") as f: + with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as f: # Create a simple dataset - data = { - "column1": ["value1", "value2", "value3"], - "column2": [10, 20, 30] - } + data = {"column1": ["value1", "value2", "value3"], "column2": [10, 20, 30]} table = pa.Table.from_pydict(data) pq.write_table(table, f.name) - datasource = ParquetDataSource([f.name]) + try: + + datasource = ParquetDataSource([os.path.abspath(f.name)]) assert len(datasource.shard_names) == 1, "Expected only one shard" shard_name = datasource.shard_names[0] + print(f"Shard name: {shard_name}") + print("File name: ", f.name) + print("File path: ", os.path.abspath(f.name)) + # sanity check: Read data starting from row 1 row_data = list(datasource.open_shard_at_row(shard_name=shard_name, row=1)) @@ -70,3 +74,6 @@ def test_basic_parquet_datasource_read_row(): assert row_data[0]["column2"] == 20 assert row_data[1]["column1"] == "value3" assert row_data[1]["column2"] == 30 + + finally: + os.unlink(f.name) From 50715e9bc64d05dfb655087862c826d59a377ad7 Mon Sep 17 00:00:00 2001 From: Nikil Ravi Date: Sun, 13 Oct 2024 15:49:49 -0700 Subject: [PATCH 92/94] read as binary file --- src/levanter/data/sharded_datasource.py | 2 +- tests/test_sharded_dataset.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/levanter/data/sharded_datasource.py b/src/levanter/data/sharded_datasource.py index 10eb42b1b..208116ca6 100644 --- a/src/levanter/data/sharded_datasource.py +++ b/src/levanter/data/sharded_datasource.py @@ -438,7 +438,7 @@ def shard_names(self) -> Sequence[str]: def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[dict]: url = self._shard_name_to_url_mapping[shard_name] - with fsspec.open(url, "r", compression="infer") as f: + with fsspec.open(url, "rb", compression="infer") as f: table = pq.read_table(f) sliced_table = table.slice(row) # zero-copy slicing for record in sliced_table.to_pylist(): diff --git a/tests/test_sharded_dataset.py b/tests/test_sharded_dataset.py index b732596e5..265a70867 100644 --- a/tests/test_sharded_dataset.py +++ b/tests/test_sharded_dataset.py @@ -61,10 +61,6 @@ def test_basic_parquet_datasource_read_row(): assert len(datasource.shard_names) == 1, "Expected only one shard" shard_name = datasource.shard_names[0] - print(f"Shard name: {shard_name}") - print("File name: ", f.name) - print("File path: ", os.path.abspath(f.name)) - # sanity check: Read data starting from row 1 row_data = list(datasource.open_shard_at_row(shard_name=shard_name, row=1)) From 3fe89957b55f799c1eb42200d8152dbdecd50c21 Mon Sep 17 00:00:00 2001 From: Nikil Ravi Date: Sun, 13 Oct 2024 18:59:37 -0700 Subject: [PATCH 93/94] simplify test --- tests/test_sharded_dataset.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/test_sharded_dataset.py 
b/tests/test_sharded_dataset.py index 265a70867..90ab6c34b 100644 --- a/tests/test_sharded_dataset.py +++ b/tests/test_sharded_dataset.py @@ -48,14 +48,12 @@ def test_basic_parquet_datasource_read_row(): import pyarrow as pa import pyarrow.parquet as pq - with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as f: + with tempfile.NamedTemporaryFile(suffix=".parquet", delete=True) as f: # Create a simple dataset data = {"column1": ["value1", "value2", "value3"], "column2": [10, 20, 30]} table = pa.Table.from_pydict(data) pq.write_table(table, f.name) - try: - datasource = ParquetDataSource([os.path.abspath(f.name)]) assert len(datasource.shard_names) == 1, "Expected only one shard" @@ -70,6 +68,3 @@ def test_basic_parquet_datasource_read_row(): assert row_data[0]["column2"] == 20 assert row_data[1]["column1"] == "value3" assert row_data[1]["column2"] == 30 - - finally: - os.unlink(f.name) From 02f34acbc27050be1d505fe80493758f9b52e83e Mon Sep 17 00:00:00 2001 From: David Hall Date: Mon, 14 Oct 2024 14:32:30 -0700 Subject: [PATCH 94/94] fix crash in data loader caused by using stale array (#765) --- src/levanter/data/loader.py | 235 +++++++++++++++++++----------------- tests/test_doremi.py | 58 ++++----- 2 files changed, 155 insertions(+), 138 deletions(-) diff --git a/src/levanter/data/loader.py b/src/levanter/data/loader.py index ab97e0827..928c9456c 100644 --- a/src/levanter/data/loader.py +++ b/src/levanter/data/loader.py @@ -2,7 +2,7 @@ import logging import time from collections import defaultdict -from typing import Iterable, Iterator, Optional, Tuple, TypeVar +from typing import AsyncIterator, Callable, Iterable, Iterator, Optional, Tuple, TypeVar import jax from jax import Array @@ -20,8 +20,9 @@ from levanter.data.dataset import AsyncDataset from levanter.data.utils import batched from levanter.shapes import NamedShapeSpec, ShapeSpec, to_raw_shape -from levanter.utils.background_iterable import BackgroundIterable -from levanter.utils.thread_utils import blocking_wait +from levanter.utils.background_iterable import BackgroundIterator +from levanter.utils.jax_utils import local_cpu_mesh +from levanter.utils.thread_utils import AsyncIteratorWrapper, blocking_wait Ex = TypeVar("Ex") @@ -62,10 +63,11 @@ def __init__( self.mesh = mesh self.Batch = Batch - def _exemplar_shape(): - return blocking_wait(self.data_store.getitem_async(0)) - - self._ex_leaves, self._ex_structure = jax.tree_flatten(_exemplar_shape(), is_leaf=is_named_array) + with local_cpu_mesh(): + # It's important that all data loading happens CPU side. We might relax this one day. 
+ self._ex_leaves, self._ex_structure = jax.tree_flatten( + blocking_wait(self.data_store.getitem_async(0)), is_leaf=is_named_array + ) local_device_indices, local_indices = self._compute_local_device_indices() @@ -98,6 +100,8 @@ def __iter__(self): return self.iter_from_step(None) def iter_from_step(self, start_from_batch: Optional[int] = None): + # sometimes we pass in an array for the start_from_batch, so we need to check for that + start_from_batch = int(start_from_batch) if start_from_batch is not None else None return DataLoaderIterator(self, start_from_batch=start_from_batch) @@ -109,115 +113,131 @@ def __init__(self, data_loader: DataLoader, start_from_batch: Optional[int] = No if self.mapping is None: self.mapping = hax.partitioning.current_thread_local_mapping() - # TODO: bring back non-prefetching version buffered_batches = self.dl.max_buffered_batches - self._batches = iter(BackgroundIterable(self._produce_batches, max_capacity=buffered_batches)) + self._batches: Iterator[Ex] + if buffered_batches == 0: + self._batches = AsyncIteratorWrapper(self._produce_batches()) + else: + self._batches = _JaxCpuBackgroundIterator(self._produce_batches, max_capacity=buffered_batches) def __next__(self): time_start = time.time() - out = next(self._batches) + individual_data_batch = next(self._batches) + data_for_this_batch = {index: datum for index, datum in zip(self.dl._local_indices, individual_data_batch)} + batch = self._batchify_local_data(data_for_this_batch) + time_end = time.time() if (time_end - time_start) > 0.5: logger.info(f"Prefetch wasn't fast enough: {time_end - time_start:.3f}") - return out + return batch async def _produce_batches(self): batch_number = self._start_from_batch or 0 - total_ex_loaded = 0 done = False while not done: - next_batch_numbers = [] - for i in range(self.dl.prefetch_size): - if self.dl.data_store.is_finite(): - next_end = (batch_number + 1) * self.dl.batch_size - available_len = await self.dl.data_store.wait_until_len_at_least(next_end) - if available_len < next_end: - done = True - break - - next_batch_numbers.append(batch_number) - batch_number += 1 + target_next_batch_number = batch_number + self.dl.prefetch_size + max_achievable_batch_number = await self._dataset_get_available_batch_number(target_next_batch_number) + if max_achievable_batch_number < target_next_batch_number: + done = True + + next_batch_numbers = list(range(batch_number, min(target_next_batch_number, max_achievable_batch_number))) + + if len(next_batch_numbers) == 0: + break + + batch_number = next_batch_numbers[-1] + 1 async for batch in self._retrieve_batches(next_batch_numbers): yield batch - total_ex_loaded += self.dl.batch_size * len(next_batch_numbers) + async def _dataset_get_available_batch_number(self, target_max_batch_number: int) -> int: + if self.dl.data_store.is_finite(): + next_end = (target_max_batch_number + 1) * self.dl.batch_size + available_len = await self.dl.data_store.wait_until_len_at_least(next_end) + max_achievable_batch_number = available_len // self.dl.batch_size - async def _retrieve_batches(self, batch_numbers: list[int]): - with hax.axis_mapping(self.mapping), self.dl.mesh: - indices_for_this_batch_of_batches: list[int] = [] - for bn in batch_numbers: - indices_this_batch = range(bn * self.dl.batch_size, (bn + 1) * self.dl.batch_size, 1) - indices_this_batch_this_process = [indices_this_batch[i] for i in self.dl._local_indices] - indices_for_this_batch_of_batches.extend(indices_this_batch_this_process) + return max_achievable_batch_number + + return 
target_max_batch_number + async def _retrieve_batches(self, batch_numbers: list[int]): + with local_cpu_mesh(): time_start = time.time() - individual_datums = await self.dl.data_store.get_batch(indices_for_this_batch_of_batches) + individual_datums_for_each_batch = await self._do_retrieve_batch_of_batches(batch_numbers) + # reshape to be per batch time_end = time.time() logger.debug(f"Time to get {len(batch_numbers)} batches: {time_end - time_start:.3f}") - time_start = time.time() - # reshape to be per batch - individual_datums = list(batched(individual_datums, len(self.dl._local_indices))) - - # below we're gonna get the indices relative to this batch (i.e. 0 to batch_size) - index_to_datum = [ - {index: datum for index, datum in zip(self.dl._local_indices, individual_data_batch)} - for individual_data_batch in individual_datums - ] - - def get_local_batch(bn: int, begin: int, end: int) -> list: - # TODO: if we ever do "big data" (i.e. huge examples) we might want to be able to load part of an example - # which will require support from the datastore (i.e. tensorstore) - device_batch = _stack_tree(self.dl.Batch.name, [index_to_datum[bn][i] for i in range(begin, end)]) - batch_leaves = hax.tree_util.tree_leaves(device_batch) - return batch_leaves - - def get_local_data_for_leaf(bn, indices: _TensorSliceIndex, leaf_index: int) -> Array: - batch_slice = indices[0] - begin, end, stride = batch_slice.indices(self.dl.batch_size) - if stride != 1: - raise ValueError("Stride must be 1") - - leaf_data = (get_local_batch(bn, begin, end))[leaf_index] - - if isinstance(leaf_data, hax.NamedArray): - # select out the batch axis - batch_index = index_where(lambda ax: ax.name == self.dl.Batch.name, leaf_data.axes) - new_indices = list(indices) - new_indices[batch_index] = slice(None) - return leaf_data.array[tuple(new_indices)] + for data in individual_datums_for_each_batch: + yield data + + def _batchify_local_data(self, data_for_this_batch: dict[int, Array]): + cache: dict[tuple[int, int], list[Array | hax.NamedArray]] = {} + + def get_local_batch(begin: int, end: int) -> list: + if (begin, end) in cache: + return cache[(begin, end)] + + # TODO: if we ever do "big data" (i.e. huge examples) we might want to be able to load part of an example + # which will require support from the datastore (i.e. 
tensorstore) + device_batch = _stack_tree(self.dl.Batch.name, [data_for_this_batch[i] for i in range(begin, end)]) + batch_leaves = hax.tree_util.tree_leaves(device_batch) + + cache[(begin, end)] = batch_leaves + + return batch_leaves + + def get_local_data_for_leaf(indices: _TensorSliceIndex, leaf_index: int) -> Array: + batch_slice = indices[0] + begin, end, stride = batch_slice.indices(self.dl.batch_size) + if stride != 1: + raise ValueError("Stride must be 1") + + leaf_data = get_local_batch(begin, end)[leaf_index] + + if isinstance(leaf_data, hax.NamedArray): + # select out the batch axis + batch_index = index_where(lambda ax: ax.name == self.dl.Batch.name, leaf_data.axes) + new_indices = list(indices) + new_indices[batch_index] = slice(None) + return leaf_data.array[tuple(new_indices)] + else: + other_indices = indices[1:] + if all(idx == slice(None) for idx in other_indices): + return leaf_data else: - other_indices = indices[1:] - if all(idx == slice(None) for idx in other_indices): - return leaf_data - else: - # TODO: this doesn't work with named axes - return leaf_data[(..., *other_indices)] - - for batch_offset, bn in enumerate(batch_numbers): - - def make_global_array_for_leaf(leaf_index, item_leaf_shape: ShapeSpec | NamedShapeSpec): - def get_data(indices): - return get_local_data_for_leaf(batch_offset, indices, leaf_index) - - raw_array = jax.make_array_from_callback( - to_raw_shape(item_leaf_shape), - jax.sharding.NamedSharding(self.dl.mesh, self._pspec_for(item_leaf_shape)), - get_data, - ) - if isinstance(item_leaf_shape, NamedShapeSpec): - return hax.NamedArray(raw_array, item_leaf_shape.shape) - else: - return raw_array - - gda_leaves = [ - make_global_array_for_leaf(leaf_index, _batchified_shape(self.dl.Batch, item_leaf)) - for leaf_index, item_leaf in enumerate(self.dl._ex_leaves) - ] - - gda_tree = jax.tree.unflatten(self.dl._ex_structure, gda_leaves) - yield gda_tree + return leaf_data[(..., *other_indices)] + + def make_global_array_for_leaf(leaf_index, item_leaf_shape: ShapeSpec | NamedShapeSpec): + def get_data(indices): + return get_local_data_for_leaf(indices, leaf_index) + + raw_array = jax.make_array_from_callback( + to_raw_shape(item_leaf_shape), + jax.sharding.NamedSharding(self.dl.mesh, self._pspec_for(item_leaf_shape)), + get_data, + ) + if isinstance(item_leaf_shape, NamedShapeSpec): + return hax.NamedArray(raw_array, item_leaf_shape.shape) + else: + return raw_array + + gda_leaves = [ + make_global_array_for_leaf(leaf_index, _batchified_shape(self.dl.Batch, item_leaf)) + for leaf_index, item_leaf in enumerate(self.dl._ex_leaves) + ] + gda_tree = jax.tree.unflatten(self.dl._ex_structure, gda_leaves) + return gda_tree + + async def _do_retrieve_batch_of_batches(self, batch_numbers): + indices_for_this_batch_of_batches: list[int] = [] + for bn in batch_numbers: + indices_this_batch = range(bn * self.dl.batch_size, (bn + 1) * self.dl.batch_size, 1) + indices_this_batch_this_process = [indices_this_batch[i] for i in self.dl._local_indices] + indices_for_this_batch_of_batches.extend(indices_this_batch_this_process) + individual_datums = await self.dl.data_store.get_batch(indices_for_this_batch_of_batches) + individual_datums_for_each_batch = list(batched(individual_datums, len(self.dl._local_indices))) + return individual_datums_for_each_batch def _pspec_for(self, shape_spec: ShapeSpec | NamedShapeSpec) -> PartitionSpec: if isinstance(shape_spec, ShapeSpec): # type: ignore @@ -227,18 +247,6 @@ def _pspec_for(self, shape_spec: ShapeSpec | NamedShapeSpec) -> 
PartitionSpec: return hax.partitioning.pspec_for_axis(shape_spec.shape, self.dl.axis_resources) # type: ignore -def _abstractify(x): - def _abstractify_array(x): - if isinstance(x, jax.numpy.ndarray): - return ShapeSpec(x.shape, x.dtype) - elif isinstance(x, hax.NamedArray): - return NamedShapeSpec(x.axes, x.dtype) - - return x - - return hax.tree_util.tree_map(_abstractify_array, x) - - def _batchified_shape(Batch, leaf: hax.NamedArray | Array) -> ShapeSpec | NamedShapeSpec: if is_named_array(leaf): return NamedShapeSpec((Batch,) + leaf.axes, leaf.dtype) @@ -246,12 +254,17 @@ def _batchified_shape(Batch, leaf: hax.NamedArray | Array) -> ShapeSpec | NamedS return ShapeSpec((Batch.size,) + leaf.shape, leaf.dtype) -def _pspec_for(self, shape_spec: ShapeSpec | NamedShapeSpec) -> PartitionSpec: - if isinstance(shape_spec, ShapeSpec): # type: ignore - batch_name = hax.partitioning.physical_axis_name(self.Batch, self.axis_resources) - return PartitionSpec(batch_name, *((None,) * (len(shape_spec.shape) - 1))) - else: - return hax.partitioning.pspec_for_axis(shape_spec.shape, self.axis_resources) # type: ignore +class _JaxCpuBackgroundIterator(BackgroundIterator[Ex]): + """ + We want the thread to only use the CPU device. + """ + + def __init__(self, producer_fn: Callable[[], Iterator[Ex] | AsyncIterator[Ex]], max_capacity: Optional[int]): + super().__init__(producer_fn, max_capacity) + + def _fill_queue_with_batches(self): + with local_cpu_mesh(): + super()._fill_queue_with_batches() @functools.partial(jax.jit, static_argnums=(0,)) diff --git a/tests/test_doremi.py b/tests/test_doremi.py index 8600c9c8b..d2cf8b590 100644 --- a/tests/test_doremi.py +++ b/tests/test_doremi.py @@ -15,7 +15,7 @@ from levanter.data import AsyncDataset from levanter.data.mixture import MixtureDataset from levanter.trainer import Trainer, TrainerConfig -from levanter.utils.jax_utils import key_iterator +from levanter.utils.jax_utils import key_iterator, local_cpu_mesh from levanter.utils.py_utils import non_caching_cycle @@ -27,6 +27,15 @@ class Example(equinox.Module): Block = hax.Axis("Block", 1024) +def platform_of_array(x): + if isinstance(x, jax.Array): + return set(d.platform for d in x.devices()) + elif isinstance(x, hax.NamedArray): + return platform_of_array(x.array) + else: + return "cpu" + + class LogitDataset(AsyncDataset[Example]): def __init__(self, W, noise, x_mask, x_bias, *, key): self.W = W @@ -52,17 +61,12 @@ def _gen_block_data(block_id): self._gen_block_data = _gen_block_data - def __iter__(self): - key_iter = key_iterator(self.key) - Dim = self.W.axes[0] - while True: - kk = next(key_iter) - this_key_iter = key_iterator(kk) - x_block = hax.random.normal(next(this_key_iter), (Block, Dim)) * self.x_mask + self.x_bias - noise = hax.random.normal(next(this_key_iter), (Block,)) * self.noise - y_block = (hax.nn.sigmoid(hax.dot(x_block, self.W, axis=Dim) + noise) > 0.5).astype(float) - for i in range(Block.size): - yield self._make_example(x_block, y_block, i) + def _make_block(self, Dim, kk): + this_key_iter = key_iterator(kk) + x_block = hax.random.normal(next(this_key_iter), (Block, Dim)) * self.x_mask + self.x_bias + noise = hax.random.normal(next(this_key_iter), (Block,)) * self.noise + y_block = (hax.nn.sigmoid(hax.dot(x_block, self.W, axis=Dim) + noise) > 0.5).astype(float) + return x_block, y_block async def async_len(self) -> int: raise ValueError("Infinitely long dataset") @@ -106,21 +110,21 @@ def test_estimate_mixture_weights(): Dim = hax.Axis("Dim", 5) Batch = hax.Axis("Batch", 32) - keys = 
key_iterator(0) - - # W = hax.random.normal(next(keys), (Dim,)) - W1 = hax.named([0.0, 0.5, 0.5, 0.0, 0.0], (Dim,)) - x1_mask = hax.named([0.0, 1.0, 1.0, 0.0, 0.0], (Dim,)) - W2 = hax.named([0.0, 0.0, 0.0, 0.0, 0.0], (Dim,)) - x2_mask = hax.named([0.0, 0.0, 0.0, 1.0, 1.0], (Dim,)) - W3 = hax.named([1.0, 0.0, 0.0, 0.0, 0.0], (Dim,)) - x3_mask = hax.named([1.0, 0.0, 0.0, 0.0, 0.0], (Dim,)) - x3_bias = hax.named([1.0, 0.0, 0.0, 0.0, 0.0], (Dim,)) - - # y = sigmoid(Wx + b + N(0, noise^2)) > 0.5 - ds1 = LogitDataset(W1, 0.1, x1_mask, 0.0, key=next(keys)) - ds2 = LogitDataset(W2, 2.0, x2_mask, 0.0, key=next(keys)) - ds3 = LogitDataset(W3, 0.05, x3_mask, x3_bias, key=next(keys)) + # data loading needs to take place on CPU + with local_cpu_mesh(): + keys = key_iterator(0) + W1 = hax.named([0.0, 0.5, 0.5, 0.0, 0.0], (Dim,)) + x1_mask = hax.named([0.0, 1.0, 1.0, 0.0, 0.0], (Dim,)) + W2 = hax.named([0.0, 0.0, 0.0, 0.0, 0.0], (Dim,)) + x2_mask = hax.named([0.0, 0.0, 0.0, 1.0, 1.0], (Dim,)) + W3 = hax.named([1.0, 0.0, 0.0, 0.0, 0.0], (Dim,)) + x3_mask = hax.named([1.0, 0.0, 0.0, 0.0, 0.0], (Dim,)) + x3_bias = hax.named([1.0, 0.0, 0.0, 0.0, 0.0], (Dim,)) + + # y = sigmoid(Wx + b + N(0, noise^2)) > 0.5 + ds1 = LogitDataset(W1, 0.1, x1_mask, 0.0, key=next(keys)) + ds2 = LogitDataset(W2, 2.0, x2_mask, 0.0, key=next(keys)) + ds3 = LogitDataset(W3, 0.05, x3_mask, x3_bias, key=next(keys)) # TODO: remove key as a requirement for models def compute_loss_fn(model, example, reduction=hax.mean, reduction_axis=None, key=None):
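
The fix in the final patch hinges on one idea: each batch is assembled from freshly fetched host data pinned to the CPU device, and only the finished global array is handed to accelerators, so a device buffer from an earlier step is never re-read. Below is a minimal sketch of that pattern in plain JAX; the helper name `make_global_batch` and the single-host, evenly-divisible-batch assumptions are illustrative only and are not part of Levanter's API.

    # Hedged sketch (assumptions noted above), not the Levanter implementation.
    import numpy as np
    import jax
    from jax.sharding import Mesh, NamedSharding, PartitionSpec

    cpu = jax.devices("cpu")[0]
    mesh = Mesh(np.array(jax.devices()), ("batch",))
    sharding = NamedSharding(mesh, PartitionSpec("batch"))

    def make_global_batch(examples):
        # Stack the freshly fetched examples and keep the stacked batch on the CPU device.
        local = jax.device_put(np.stack(examples), cpu)

        def fetch(index):
            # `index` is the tuple of slices for one device's shard of the global
            # array; with a single host, `local` holds the entire batch.
            return local[index]

        # Only here does data move to accelerator devices, sharded along "batch".
        return jax.make_array_from_callback(local.shape, sharding, fetch)

    # Example: eight 16-dim feature vectors sharded across the "batch" mesh axis.
    batch = make_global_batch([np.ones(16, dtype=np.float32) for _ in range(8)])

In the patch itself the same idea appears as `local_cpu_mesh()` wrapped around batch assembly and a CPU-pinned `_JaxCpuBackgroundIterator`, rather than caching a previously built array across steps.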