configured 3b

TransformerLensOrg · Sep 26, 2024 · d410201 · d410201
1 parent c4ac5a9
commit d410201
Showing 1 changed file with 60 additions and 0 deletions.
diff --git a/transformer_lens/loading_from_pretrained.py b/transformer_lens/loading_from_pretrained.py
@@ -152,6 +152,9 @@
     "meta-llama/Meta-Llama-3-70B",
     "meta-llama/Meta-Llama-3-70B-Instruct",
     "meta-llama/Llama-3.2-1B",
+    "meta-llama/Llama-3.2-3B",
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "meta-llama/Llama-3.2-3B-Instruct",
     "Baidicoot/Othello-GPT-Transformer-Lens",
     "bert-base-cased",
     "roneneldan/TinyStories-1M",
@@ -905,6 +908,63 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "final_rms": True,
             "gated_mlp": True,
         }
+    elif "Llama-3.2-3B" in official_model_name:
+        cfg_dict = {
+            "d_model": 3072,
+            "d_head": 128,
+            "n_heads": 24,
+            "d_mlp": 8192,
+            "n_layers": 28,
+            "n_ctx": 2048,  # capped due to memory issues
+            "eps": 1e-5,
+            "d_vocab": 128256,
+            "act_fn": "silu",
+            "n_key_value_heads": 8,
+            "normalization_type": "RMS",
+            "positional_embedding_type": "rotary",
+            "rotary_adjacent_pairs": False,
+            "rotary_dim": 128,
+            "final_rms": True,
+            "gated_mlp": True,
+        }
+    elif "Llama-3.2-1B-Instruct" in official_model_name:
+        cfg_dict = {
+            "d_model": 2048,
+            "d_head": 64,
+            "n_heads": 32,
+            "d_mlp": 8192,
+            "n_layers": 16,
+            "n_ctx": 2048,  # capped due to memory issues
+            "eps": 1e-5,
+            "d_vocab": 128256,
+            "act_fn": "silu",
+            "n_key_value_heads": 8,
+            "normalization_type": "RMS",
+            "positional_embedding_type": "rotary",
+            "rotary_adjacent_pairs": False,
+            "rotary_dim": 64,
+            "final_rms": True,
+            "gated_mlp": True,
+        }
+    elif "Llama-3.2-3B-Instruct" in official_model_name:
+        cfg_dict = {
+            "d_model": 2048,
+            "d_head": 64,
+            "n_heads": 32,
+            "d_mlp": 8192,
+            "n_layers": 16,
+            "n_ctx": 2048,  # capped due to memory issues
+            "eps": 1e-5,
+            "d_vocab": 128256,
+            "act_fn": "silu",
+            "n_key_value_heads": 8,
+            "normalization_type": "RMS",
+            "positional_embedding_type": "rotary",
+            "rotary_adjacent_pairs": False,
+            "rotary_dim": 64,
+            "final_rms": True,
+            "gated_mlp": True,
+        }
     elif architecture == "GPTNeoForCausalLM":
         cfg_dict = {
             "d_model": hf_config.hidden_size,