Merge pull request #93 from ssbuild/dev
support chatglm3-6b-32k
ssbuild authored Nov 22, 2023
2 parents 7464312 + 1af4fb7 commit 36c9db2
Showing 4 changed files with 10 additions and 3 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -19,6 +19,8 @@ pip install -U git+https://github.com/ssbuild/deep_training.git --no-deps --forc
## update
- <strong>2023-11-13</strong>
- 0.2.9 release
+- 0.2.9.post0 support chatglm3-6b-32k

- <strong>2023-10-22</strong>
- 0.2.7
- support clip full training https://github.com/ssbuild/clip_finetuning
2 changes: 1 addition & 1 deletion setup.py
@@ -20,7 +20,7 @@
]
setup(
name='deep_training',
-version='0.2.9',
+version='0.2.9.post0',
description='an easy training architecture',
long_description='torch_training: https://github.com/ssbuild/deep_training.git',
license='Apache License 2.0',
2 changes: 2 additions & 0 deletions src/deep_training/nlp/models/chatglm3/configuration_chatglm.py
@@ -16,6 +16,7 @@ def __init__(
classifier_dropout=None,
attention_dropout=0.0,
layernorm_epsilon=1e-5,
+rope_ratio=1,
rmsnorm=True,
apply_residual_connection_post_layernorm=False,
post_layer_norm=True,
@@ -44,6 +45,7 @@ def __init__(
self.classifier_dropout = classifier_dropout
self.attention_dropout = attention_dropout
self.layernorm_epsilon = layernorm_epsilon
+self.rope_ratio = rope_ratio
self.rmsnorm = rmsnorm
self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
self.post_layer_norm = post_layer_norm
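The config change is backward compatible: rope_ratio defaults to 1, so existing ChatGLM3 configs keep the original RoPE base, while a long-context checkpoint such as chatglm3-6b-32k can raise it from its config.json. A minimal sketch of the new knob, assuming the config class lives in configuration_chatglm.py next to modeling_chatglm.py; the rope_ratio=50 value is purely illustrative, not the value shipped with any particular checkpoint:

```python
from deep_training.nlp.models.chatglm3.configuration_chatglm import ChatGLMConfig

# Default keeps the original rotary behaviour (base stays at 10000).
default_cfg = ChatGLMConfig()
assert default_cfg.rope_ratio == 1

# A long-context checkpoint enlarges the effective base by setting
# rope_ratio > 1 in its config; 50 here is illustrative only.
long_cfg = ChatGLMConfig(rope_ratio=50)
print(long_cfg.rope_ratio)  # 50
```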
7 changes: 5 additions & 2 deletions src/deep_training/nlp/models/chatglm3/modeling_chatglm.py
@@ -122,12 +122,13 @@ def split_tensor_along_last_dim(


class RotaryEmbedding(nn.Module):
-def __init__(self, dim, original_impl=False, device=None, dtype=None):
+def __init__(self, dim, rope_ratio=1, original_impl=False, device=None, dtype=None):
super().__init__()
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
self.register_buffer("inv_freq", inv_freq)
self.dim = dim
self.original_impl = original_impl
+self.rope_ratio = rope_ratio

def forward_impl(
self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
@@ -139,6 +140,7 @@ def forward_impl(
https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
"""
# $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
+base = base * self.rope_ratio
theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem))

# Create position indexes `[0, 1, ..., seq_len - 1]`
@@ -754,7 +756,8 @@ def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
)

-self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device,
+self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio,
+original_impl=config.original_rope, device=device,
dtype=config.torch_dtype)
self.encoder = init_method(GLMTransformer, config, **init_kwargs)
self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
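The only behavioural change in RotaryEmbedding is that forward_impl multiplies the RoPE base by rope_ratio before computing the per-dimension frequencies, which stretches the rotary wavelengths for the 32k context window. A small self-contained sketch of that effect in plain torch, using an illustrative rope_ratio of 50:

```python
import torch

def rope_theta(n_elem: int, rope_ratio: float, base: int = 10000) -> torch.Tensor:
    # Mirrors forward_impl above: stretch the base by rope_ratio, then compute
    # the per-dimension inverse frequencies.
    base = base * rope_ratio
    return 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float) / n_elem))

theta_default = rope_theta(64, rope_ratio=1)   # original ChatGLM3 behaviour
theta_scaled = rope_theta(64, rope_ratio=50)   # illustrative long-context setting

# Apart from the first (constant) term, every frequency drops, i.e. the rotary
# wavelengths grow, so positions well beyond the original training window still
# map to distinct, slowly varying angles.
print(theta_default[:4])
print(theta_scaled[:4])
```

With the default rope_ratio=1 the multiplication is a no-op, so existing ChatGLM3 checkpoints are unaffected by this commit.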

