Merge pull request #93 from ssbuild/dev
support chatglm3-6b-32k
ssbuild authored Nov 22, 2023
2 parents 7464312 + 1af4fb7 commit 36c9db2
Showing 4 changed files with 10 additions and 3 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -19,6 +19,8 @@ pip install -U git+https://github.com/ssbuild/deep_training.git --no-deps --forc
## update
- <strong>2023-11-13</strong>
- 0.2.9 release
+- 0.2.9.post0 support chatglm3-6b-32k

- <strong>2023-10-22</strong>
- 0.2.7
- support clip full training https://github.com/ssbuild/clip_finetuning
2 changes: 1 addition & 1 deletion setup.py
@@ -20,7 +20,7 @@
]
setup(
name='deep_training',
-version='0.2.9',
+version='0.2.9.post0',
description='an easy training architecture',
long_description='torch_training: https://github.com/ssbuild/deep_training.git',
license='Apache License 2.0',
2 changes: 2 additions & 0 deletions src/deep_training/nlp/models/chatglm3/configuration_chatglm.py
@@ -16,6 +16,7 @@ def __init__(
classifier_dropout=None,
attention_dropout=0.0,
layernorm_epsilon=1e-5,
+rope_ratio=1,
rmsnorm=True,
apply_residual_connection_post_layernorm=False,
post_layer_norm=True,
@@ -44,6 +45,7 @@ def __init__(
self.classifier_dropout = classifier_dropout
self.attention_dropout = attention_dropout
self.layernorm_epsilon = layernorm_epsilon
+self.rope_ratio = rope_ratio
self.rmsnorm = rmsnorm
self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
self.post_layer_norm = post_layer_norm
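The config change is backward compatible: rope_ratio defaults to 1, so existing ChatGLM3 configs keep the original RoPE base, while a long-context checkpoint such as chatglm3-6b-32k can raise it from its config.json. A minimal sketch of the new knob, assuming the config class lives in configuration_chatglm.py next to modeling_chatglm.py; the rope_ratio=50 value is purely illustrative, not the value shipped with any particular checkpoint:

```python
from deep_training.nlp.models.chatglm3.configuration_chatglm import ChatGLMConfig

# Default keeps the original rotary behaviour (base stays at 10000).
default_cfg = ChatGLMConfig()
assert default_cfg.rope_ratio == 1

# A long-context checkpoint enlarges the effective base by setting
# rope_ratio > 1 in its config; 50 here is illustrative only.
long_cfg = ChatGLMConfig(rope_ratio=50)
print(long_cfg.rope_ratio)  # 50
```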
7 changes: 5 additions & 2 deletions src/deep_training/nlp/models/chatglm3/modeling_chatglm.py
@@ -122,12 +122,13 @@ def split_tensor_along_last_dim(


class RotaryEmbedding(nn.Module):
-def __init__(self, dim, original_impl=False, device=None, dtype=None):
+def __init__(self, dim, rope_ratio=1, original_impl=False, device=None, dtype=None):
super().__init__()
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
self.register_buffer("inv_freq", inv_freq)
self.dim = dim
self.original_impl = original_impl
+self.rope_ratio = rope_ratio

def forward_impl(
self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
@@ -139,6 +140,7 @@ def forward_impl(
https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
"""
# $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
+base = base * self.rope_ratio
theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem))

# Create position indexes `[0, 1, ..., seq_len - 1]`
@@ -754,7 +756,8 @@ def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
)

-self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device,
+self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio,
+original_impl=config.original_rope, device=device,
dtype=config.torch_dtype)
self.encoder = init_method(GLMTransformer, config, **init_kwargs)
self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
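The only behavioural change in RotaryEmbedding is that forward_impl multiplies the RoPE base by rope_ratio before computing the per-dimension frequencies, which stretches the rotary wavelengths for the 32k context window. A small self-contained sketch of that effect in plain torch, using an illustrative rope_ratio of 50:

```python
import torch

def rope_theta(n_elem: int, rope_ratio: float, base: int = 10000) -> torch.Tensor:
    # Mirrors forward_impl above: stretch the base by rope_ratio, then compute
    # the per-dimension inverse frequencies.
    base = base * rope_ratio
    return 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float) / n_elem))

theta_default = rope_theta(64, rope_ratio=1)   # original ChatGLM3 behaviour
theta_scaled = rope_theta(64, rope_ratio=50)   # illustrative long-context setting

# Apart from the first (constant) term, every frequency drops, i.e. the rotary
# wavelengths grow, so positions well beyond the original training window still
# map to distinct, slowly varying angles.
print(theta_default[:4])
print(theta_scaled[:4])
```

With the default rope_ratio=1 the multiplication is a no-op, so existing ChatGLM3 checkpoints are unaffected by this commit.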

