Commit
fix pattern and conversion
Ali Taghibakhshi committed Dec 2, 2024
1 parent edba56a commit 0935784
Showing 2 changed files with 92 additions and 70 deletions.
33 changes: 16 additions & 17 deletions nemo/collections/llm/gpt/model/hyena.py
@@ -161,31 +161,30 @@ def transform_source_dict(self, source):
import re

layer_map = {i + 2: i for i in range(self.num_layers)}
layer_map[self.num_layers + 3] = self.num_layers + 1
layer_map[self.num_layers + 3] = self.num_layers
updated_data = {}

for key in list(source.keys()):
for key in list(source['module'].keys()):
if "_extra" in key:
source.pop(key)
source['module'].pop(key)
else:
match = re.search(r'sequential\.(\d+)', key)
if match:
original_layer_num = int(match.group(1))
if original_layer_num in layer_map:
# Create the updated key by replacing the layer number
new_key = re.sub(rf'\b{original_layer_num}\b', str(layer_map[original_layer_num]), key)
updated_data[new_key] = source[key]
updated_data[new_key] = source['module'][key]
else:
# Keep the key unchanged if no mapping exists
updated_data[key] = source[key]
updated_data[key] = source['module'][key]
else:
updated_data[key] = source[key]

updated_data[key] = source['module'][key]
return updated_data

source = ModelState(source, self.config.num_layers)
target = self.init()
trainer = self.nemo_setup(target, ckpt_async_save=False)
trainer = self.nemo_setup(target, ckpt_async_save=False, save_ckpt_format='zarr')
source.to(self.config.params_dtype)
target.to(self.config.params_dtype)
self.convert_state(source, target)
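For readers following the conversion fix, here is a minimal sketch (not part of the commit) of what transform_source_dict now does, assuming num_layers = 4 and a source checkpoint that nests its weights under a 'module' key:

import re

num_layers = 4
layer_map = {i + 2: i for i in range(num_layers)}  # {2: 0, 3: 1, 4: 2, 5: 3}
layer_map[num_layers + 3] = num_layers             # final norm: 7 -> 4 (was 5 before this fix)

source = {
    'module': {
        'sequential.2.mixer.weight': 'first layer',
        'sequential.5.mlp.weight': 'last layer',
        'sequential.7.norm.weight': 'final norm',
        'sequential.2.mixer._extra_state': 'dropped',
    }
}

updated_data = {}
for key in list(source['module'].keys()):
    if '_extra' in key:
        continue  # the real code pops these keys from source['module']
    n = int(re.search(r'sequential\.(\d+)', key).group(1))
    new_key = re.sub(rf'\b{n}\b', str(layer_map[n]), key) if n in layer_map else key
    updated_data[new_key] = source['module'][key]

print(sorted(updated_data))
# ['sequential.0.mixer.weight', 'sequential.3.mlp.weight', 'sequential.4.norm.weight']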
@@ -204,7 +203,7 @@ def convert_state(self, source, target):
te_enabled = True
scale_or_weight = 'weight'
mapping['sequential.0.word_embeddings.weight'] = 'embedding.word_embeddings.weight'
mapping[f'sequential.{len(self.config.hybrid_override_pattern)+1}.norm.{scale_or_weight}'] = (
mapping[f'sequential.{len(self.config.hybrid_override_pattern)}.norm.{scale_or_weight}'] = (
'decoder.final_norm.weight'
)
for i, symbol in enumerate(self.config.hybrid_override_pattern):
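A quick bit of arithmetic (illustrative, using the 7b settings below) shows why the '+1' is dropped from the final-norm mapping: after transform_source_dict shifts the source indices, the final norm lands at sequential.<num_layers>, and with one pattern symbol per layer that should equal len(hybrid_override_pattern):

num_layers = 32                                  # Hyena7bConfig.num_layers
pattern_len = 32                                 # len(Hyena7bConfig.hybrid_override_pattern)
final_norm_source_index = num_layers + 3         # 35: position in the raw checkpoint
final_norm_remapped_index = num_layers           # 32: layer_map[num_layers + 3]
assert final_norm_remapped_index == pattern_len  # hence f'sequential.{len(pattern)}.norm.{scale_or_weight}'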
@@ -334,7 +333,7 @@ class HyenaTestConfig(HyenaConfig):

@dataclass
class Hyena7bConfig(HyenaConfig):
hybrid_override_pattern: str = "SDH*SHDSDH*SDHSDH*SDHSDH*SDHSDH*"
hybrid_override_pattern: str = "SDH*SDHSDH*SDHSDH*SDHSDH*SDHSDH*"
num_layers: int = 32
seq_length: int = 8192
hidden_size: int = 4096
@@ -355,9 +354,9 @@ class Hyena7bConfig(HyenaConfig):
add_qkv_bias: bool = False
add_bias_linear: bool = False
layernorm_epsilon: float = 1e-6
# fp8: str = 'hybrid'
# fp8_amax_history_len: int = 16
# fp8_amax_compute_algo: str = "max"
fp8: str = 'hybrid'
fp8_amax_history_len: int = 16
fp8_amax_compute_algo: str = "max"
recompute_granularity: str = 'full'
recompute_method: str = 'uniform'
recompute_num_layers: int = 4
@@ -367,7 +366,7 @@ class Hyena7bConfig(HyenaConfig):

@dataclass
class Hyena40bConfig(HyenaConfig):
hybrid_override_pattern: str = "SDH*SHDSDH*SDHSDH*SDHSDH*SDHSDH*SDH*SDHSDH*SDHSDH*"
hybrid_override_pattern: str = "SDH*SDHSDH*SDHSDH*SDHSDH*SDHSDH*SDH*SDHSDH*SDHSDH*"
num_layers: int = 50
seq_length: int = 8192
hidden_size: int = 8192
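A quick consistency check on the corrected patterns (illustrative only): each symbol in hybrid_override_pattern is expected to describe one layer, so the string lengths should match num_layers for both configs:

pattern_7b = "SDH*SDHSDH*SDHSDH*SDHSDH*SDHSDH*"
pattern_40b = "SDH*SDHSDH*SDHSDH*SDHSDH*SDHSDH*SDH*SDHSDH*SDHSDH*"
assert len(pattern_7b) == 32   # Hyena7bConfig.num_layers
assert len(pattern_40b) == 50  # Hyena40bConfig.num_layers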
@@ -388,9 +387,9 @@ class Hyena40bConfig(HyenaConfig):
add_qkv_bias: bool = False
add_bias_linear: bool = False
layernorm_epsilon: float = 1e-6
# fp8: str = 'hybrid'
# fp8_amax_history_len: int = 16
# fp8_amax_compute_algo: str = "max"
fp8: str = 'hybrid'
fp8_amax_history_len: int = 16
fp8_amax_compute_algo: str = "max"
recompute_granularity: str = 'full'
recompute_method: str = 'uniform'
recompute_num_layers: int = 2
129 changes: 76 additions & 53 deletions tests/collections/llm/gpt/model/test_hyena.py
@@ -30,22 +30,24 @@
from nemo.lightning import NeMoLogger
from nemo.lightning.pytorch.callbacks import ModelCheckpoint
from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule
from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, WarmupHoldPolicyScheduler


"""
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc-per-node=8 /opt/NeMo/tests/collections/llm/gpt/model/test_hyena.py \
--num-nodes=1 \
--devices=8 \
--max-steps=1000 \
--val-check-interval=100 \
--max-steps=50000 \
--val-check-interval=10 \
--experiment-dir=/lustre/fsw/coreai_dlalgo_genai/ataghibakhsh/checkpoints/hyena_exp \
--ckpt-dir=/lustre/fsw/coreai_dlalgo_genai/ataghibakhsh/checkpoints/hyena_exp \
--data-path=/lustre/fsw/coreai_dlalgo_genai/ataghibakhsh/datasets/hyena_data/hg38/pretraining_data_hg38/data_hg38_all_text_CharLevelTokenizer_document \
--seq-length=8192 \
--tensor-parallel-size=4 \
--tensor-parallel-size=1 \
--pipeline-model-parallel-size=1 \
--context-parallel-size=2 \
--sequence-parallel \
--context-parallel-size=1 \
--global-batch-size=16 \
--micro-batch-size=4 \
--micro-batch-size=2 \
--model-size=7b
"""

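As a sanity check on the launch command above (assuming the usual Megatron sizing, data-parallel size = world size / (TP x PP x CP)):

world_size = 1 * 8                  # --num-nodes * --devices
tp, pp, cp = 1, 1, 1                # --tensor/pipeline/context-parallel-size
dp = world_size // (tp * pp * cp)   # 8 data-parallel replicas
gbs, mbs = 16, 2                    # --global-batch-size, --micro-batch-size
grad_acc = gbs // (mbs * dp)        # 1: no gradient accumulation needed
print(dp, grad_acc)                 # 8 1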
Expand All @@ -69,7 +71,10 @@ def get_args():
'--model-size', type=str, default="7b", help="Model size, choose between 7b, 40b, or test (4 layers, less than 1b)"
)
parser.add_argument(
'--experiment-dir', type=str, default=None, help="directory to write results and checkpoints to"
'--experiment-dir', type=str, default=None, help="directory to write results to"
)
parser.add_argument(
'--ckpt-dir', type=str, default=None, help="directory to write checkpoints to"
)
parser.add_argument('--tokenizer-path', type=str, default=None, help="Path to tokenizer model")

@@ -84,23 +89,13 @@ def get_args():
"byte-level",
)

# data = MockDataModule(
# seq_length=args.seq_length,
# tokenizer=tokenizer,
# micro_batch_size=args.micro_batch_size,
# global_batch_size=args.global_batch_size,
# num_train_samples=10_000,
# num_val_samples=10,
# num_test_samples=10,
# num_workers=0,
# pin_memory=False,
# )
data = PreTrainingDataModule(
paths=args.data_path,
seq_length=args.seq_length,
micro_batch_size=args.micro_batch_size,
global_batch_size=args.global_batch_size,
seed=1234,
num_workers=2,
tokenizer=tokenizer,
)
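For a quick smoke test without the hg38 dataset, the MockDataModule block that this commit deletes could still be dropped in roughly as follows. This is a sketch based on the removed comment; it assumes MockDataModule is exported from nemo.collections.llm and reuses the script's existing args and tokenizer:

data = llm.MockDataModule(            # assumption: available as llm.MockDataModule
    seq_length=args.seq_length,
    tokenizer=tokenizer,
    micro_batch_size=args.micro_batch_size,
    global_batch_size=args.global_batch_size,
    num_train_samples=10_000,
    num_val_samples=10,
    num_test_samples=10,
    num_workers=0,
    pin_memory=False,
)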

@@ -115,73 +110,101 @@ def get_args():

hyena_config.seq_length = args.seq_length
model = llm.GPTModel(hyena_config, tokenizer=data.tokenizer)
strategy = nl.MegatronStrategy(
tensor_model_parallel_size=args.tensor_parallel_size,
pipeline_model_parallel_size=args.pipeline_model_parallel_size,
context_parallel_size=args.context_parallel_size,
pipeline_dtype=torch.bfloat16,
sequence_parallel=args.sequence_parallel,
ckpt_load_optimizer=False,
ckpt_save_optimizer=False,
ckpt_async_save=False,
)

checkpoint_callback = ModelCheckpoint(
every_n_train_steps=args.val_check_interval,
dirpath=args.experiment_dir,
save_top_k=5,
save_optim_on_train_end=True
)
callbacks = [checkpoint_callback]


loggers = []
wandb_logger = WandbLogger(
name=(f"hyena-size-{args.model_size}-TP{args.tensor_parallel_size}-"
f"PP{args.pipeline_model_parallel_size}-CP{args.context_parallel_size}"
f"-GBS{args.global_batch_size}-MBS{args.micro_batch_size}"),
project="hyena_ux",
project="hyena_ux_test",
save_dir=args.experiment_dir,
)
# wandb_logger = TensorBoardLogger(
# save_dir='dummy', ## NOTE: this gets overwritten by default
# )
loggers.append(wandb_logger)

opt_config = OptimizerConfig(
optimizer='adam',
lr=6e-4,
min_lr=6e-5,
clip_grad=1.0,
use_distributed_optimizer=True,
bf16=True,
nemo_logger = NeMoLogger(
log_dir=args.experiment_dir,
wandb=wandb_logger
)
opt = MegatronOptimizerModule(config=opt_config, no_weight_decay_cond=hyena_config.hyena_no_weight_decay_cond_fn)

trainer = nl.Trainer(
devices=args.devices,
num_nodes=args.num_nodes,
max_steps=args.max_steps,
accelerator="gpu",
strategy=strategy,
strategy = nl.MegatronStrategy(
tensor_model_parallel_size=args.tensor_parallel_size,
pipeline_model_parallel_size=args.pipeline_model_parallel_size,
context_parallel_size=args.context_parallel_size,
pipeline_dtype=torch.bfloat16,
sequence_parallel=args.sequence_parallel,
ckpt_load_optimizer=True,
ckpt_save_optimizer=True,
ckpt_async_save=False,
save_ckpt_format='zarr',
),
logger=loggers,
callbacks=callbacks,
callbacks = [checkpoint_callback],
log_every_n_steps=1,
limit_val_batches=2,
limit_val_batches=10,
num_sanity_val_steps=0,
plugins=nl.MegatronMixedPrecision(
precision="bf16-mixed",
params_dtype=torch.bfloat16,
),
val_check_interval=args.val_check_interval,
)

nemo_logger = NeMoLogger(
log_dir=args.experiment_dir,
# Logger setup
nemo_logger.setup(
trainer,
resume_if_exists=True,
)

app_state = _setup(
model=model,
data=data,
trainer=trainer,
log=nemo_logger,
optim=opt,
resume=None,
tokenizer='data',
model_transform=None,
# Auto resume setup
from nemo.lightning.pytorch.strategies.utils import RestoreConfig

resume = nl.AutoResume(
resume_if_exists=True,
resume_ignore_no_checkpoint=True,
resume_past_end=True,
resume_from_directory=args.ckpt_dir,
# restore_config=(
# RestoreConfig(
# path=args.ckpt_dir,
# load_model_state = True,
# load_optim_state = True,
# ) if args.ckpt_dir else None
# ),
)
resume.setup(trainer, model)

# Optimizer and scheduler setup
opt_config = OptimizerConfig(
optimizer='adam',
lr=0.0003,
adam_beta1=0.9,
adam_beta2=0.95,
use_distributed_optimizer=True,
bf16=True,
)
sched = CosineAnnealingScheduler(
max_steps=trainer.max_steps,
warmup_steps=2500,
min_lr=0.00003,
)

opt = MegatronOptimizerModule(opt_config, sched)
opt.connect(model)

# Start training
trainer.fit(model, data)
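For reference, the cosine schedule configured above traces out roughly the following curve. This is a sketch assuming linear warmup to the peak LR over 2,500 steps and cosine decay to min_lr by step 50,000, which is the usual shape of a warmup plus cosine-annealing schedule:

import math

def lr_at(step, max_lr=3e-4, min_lr=3e-5, warmup=2500, max_steps=50_000):
    if step < warmup:
        return max_lr * step / warmup
    progress = (step - warmup) / (max_steps - warmup)
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * progress))

for s in (0, 2500, 26_250, 50_000):
    print(s, f"{lr_at(s):.2e}")  # 0.00e+00, 3.00e-04, 1.65e-04, 3.00e-05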
