diff --git a/torchtrain/models/llama/__init__.py b/torchtrain/models/llama/__init__.py
index b40801d0..e6175ca9 100644
--- a/torchtrain/models/llama/__init__.py
+++ b/torchtrain/models/llama/__init__.py
@@ -7,7 +7,8 @@ llama_configs = {
     "debugmodel": ModelArgs(dim=256, n_layers=2, n_heads=16),
-    "1B": ModelArgs(dim=1024, n_layers=16, n_heads=8),
+    "271M": ModelArgs(dim=1024, n_layers=16, n_heads=8),
+    "1B": ModelArgs(dim=2048, n_layers=18, n_heads=16),
     "7B": ModelArgs(dim=4096, n_layers=32, n_heads=32),
     "13B": ModelArgs(dim=5120, n_layers=40, n_heads=40),
     "26B": ModelArgs(dim=5120, n_layers=80, n_heads=40),
diff --git a/train_configs/debug_model.toml b/train_configs/debug_model.toml
index 1cca38b0..f57e14f7 100644
--- a/train_configs/debug_model.toml
+++ b/train_configs/debug_model.toml
@@ -3,7 +3,7 @@ dump_folder = "./outputs"
 
 [profiling]
-run_profiler = true
+run_profiler = false
 save_traces_folder = "profiling/traces"
 # profiling frequency - example: 10 means every 10th iter will be profiled
 profile_every_x_iter = 10
@@ -15,7 +15,7 @@ log_freq = 10
 
 [model]
 name = "llama"
-flavor = "debugmodel"
+flavor = "1B"
 tokenizer_path = "./torchtrain/datasets/tokenizer/tokenizer.model"
 
 [optimizer]