sigh

stanford-crfm · Oct 4, 2024 · 7ad4092
1 parent 04ce944
commit 7ad4092
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 4 deletions.
diff --git a/config/data/dclm_gpt_neo.yaml b/config/data/dclm_gpt_neo.yaml
@@ -1,7 +1,8 @@
-cache_dir: "gs://marin-us-central2/scratch/dlwh/tokenized/gpt_neox/"
+cache_dir: "gs://marin-us-central2/tokenized/gpt_neox/"
 tokenizer: "EleutherAI/gpt-neox-20b"
 cache_options:
-  batch_size: 4096
+  batch_size: 256
+  num_shard_groups: 1024
 stop_strategy: restart
 shuffle: 100000
 configs:

diff --git a/config/data/dolma_olmo_paloma.yaml b/config/data/dolma_olmo_paloma.yaml
@@ -135,4 +135,4 @@ train_weights:
   paloma/ptb: 0.0
   paloma/redpajama: 0.0
   paloma/twitterAAE_HELM_fixed: 0.0
-  paloma/wikitext_103: 0.0
+  paloma/wikitext_103: 0.0
diff --git a/src/levanter/store/cache.py b/src/levanter/store/cache.py
@@ -1156,7 +1156,8 @@ def generator():
     generator_fns = [_make_generator_fn(group) for group in groups]
 
     readers = [
-        RayPrefetchQueue(fn, 128, producer_options=dict(name=name)) for name, fn in zip(group_names, generator_fns)
+        RayPrefetchQueue(fn, 128, producer_options=dict(name=name, scheduling_strategy="SPREAD"))
+        for name, fn in zip(group_names, generator_fns)
     ]
 
     # then figure out the first shard to start from. This is the first unfinished shard with the minimum number of rows