Merge branch 'main' into vchen/neva-blend-data
yaoyu-33 authored Aug 7, 2024
2 parents 43da9ec + 7cae5c4 commit 61e2b15
Showing 27 changed files with 1,039 additions and 177 deletions.
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -33,7 +33,7 @@ WORKDIR /workspace
 
 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
-ARG MODELOPT_VERSION=0.13.0
+ARG MODELOPT_VERSION=0.15.0
 ARG MCORE_TAG=2fd6e2b74efca73a1f2d27b89bb5419384b4d3bf
 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
 RUN \
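The pinned ModelOpt version can be sanity-checked inside the built image. A minimal sketch, assuming the wheel is distributed under the name `nvidia-modelopt` (adjust if your build differs):

```python
# Minimal sketch: verify the installed Model Optimizer matches the Dockerfile pin.
# "nvidia-modelopt" as the distribution name is an assumption, not part of this diff.
from importlib.metadata import version

EXPECTED = "0.15.0"  # value of MODELOPT_VERSION in Dockerfile.ci

installed = version("nvidia-modelopt")
assert installed.startswith(EXPECTED), f"Expected modelopt {EXPECTED}, found {installed}"
print(f"nvidia-modelopt {installed} OK")
```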
@@ -70,7 +70,7 @@ model:
     llm:
       from_pretrained: null #path to nemo checkpoint
       freeze: False
-      model_type: llama_2 # `nvgpt` or `llama_2` supported
+      model_type: llama_2 # `v1`, `nvgpt`, `llama_2`, `llama_3` and `mistral` supported
     vision_encoder:
       from_pretrained: "Lin-Chen/ShareGPT4V-13B_Pretrained_vit-large336-l12" # huggingface path or name
       from_hf: True
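For reference, a minimal sketch of selecting one of the newly supported LLM backbones from Python with OmegaConf; the config path below is illustrative and points at a local NeMo checkout, not something defined by this commit:

```python
# Minimal sketch: load a NeVA config and switch the LLM backbone type.
# The config path is an assumption for illustration; adjust to your checkout.
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml")
# Per the updated comment, `v1`, `nvgpt`, `llama_2`, `llama_3` and `mistral` are accepted.
cfg.model.mm_cfg.llm.model_type = "mistral"
print(OmegaConf.to_yaml(cfg.model.mm_cfg.llm))
```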
@@ -70,7 +70,7 @@ model:
     llm:
       from_pretrained: null # path to nemo checkpoint
       freeze: True
-      model_type: llama_2 # `nvgpt` or `llama_2` supported
+      model_type: llama_2 # `v1`, `nvgpt`, `llama_2`, `llama_3` and `mistral` supported
     vision_encoder:
       from_pretrained: "" # path or name
       from_hf: True
220 changes: 220 additions & 0 deletions examples/multimodal/multimodal_llm/neva/conf/neva_mixtral_config.yaml
@@ -0,0 +1,220 @@
trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
  max_steps: 4650 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10
  val_check_interval: 100
  check_val_every_n_epoch: null
  limit_val_batches: 50
  limit_test_batches: 500
  accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
  gradient_clip_val: 1.0
  benchmark: False
  enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: nemo_neva
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  resume_from_checkpoint: ${model.resume_from_checkpoint}
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits
    filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
  ema:
    enable: False
    decay: 0.9999
    validate_original_weights: False
    every_n_steps: 1
    cpu_offload: False

model:
  precision: ${trainer.precision}

  # specify micro_batch_size, global_batch_size, and model parallelism
  # gradient accumulation will be done automatically based on data_parallel_size

  # Batch size guideline for different types of dataset
  micro_batch_size: 1 # limited by GPU memory
  global_batch_size: 1 # will use more micro batches to reach global batch size

  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism
  expert_model_parallel_size: 1
  context_parallel_size: 1
  virtual_pipeline_model_parallel_size: null # interleaved pipeline

  restore_from_path: null # used in fine-tuning

  # Multimodal configs
  mm_cfg:
    llm:
      from_pretrained: null
      freeze: True
      model_type: mistral # `v1`, `nvgpt`, `llama_2`, `llama_3` and `mistral` supported
    vision_encoder:
      from_pretrained: 'google/siglip-so400m-patch14-384' # path or name
      from_hf: True
      patch_dim: 14
      crop_size: [384, 384]
      hidden_size: 1152 # could be found from model but tricky in code
      vision_select_layer: -2 # default to the last layer
      class_token_length: 0
      freeze: True
    pretrain_mm_mlp_adapter: null # path to pretrained mm adapter
    mm_mlp_adapter_type: mlp_downsample
    use_im_start_end: False


  # LLM configs
  # use GPTModel from megatron.core
  mcore_gpt: True

  moe_grouped_gemm: False
  moe_token_dispatcher_type: alltoall
  moe_aux_loss_coeff: 0.01
  # model architecture
  encoder_seq_length: 4096
  max_position_embeddings: 32768
  position_embedding_type: rope
  num_layers: 32
  hidden_size: 4096
  ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size.
  num_attention_heads: 32
  init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
  use_scaled_init_method: True # use scaled residuals initialization
  hidden_dropout: 0.0 # Dropout probability for hidden state transformer.
  attention_dropout: 0.0 # Dropout probability for attention
  ffn_dropout: 0.0 # Dropout probability in the feed-forward layer.
  kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
  apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number.
  normalization: rmsnorm # Type of normalization layers
  layernorm_epsilon: 1.0e-05
  do_layer_norm_weight_decay: False # True means weight decay on all params
  make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
  pre_process: True # add embedding
  post_process: True # add pooler
  persist_layer_norm: True # Use of persistent fused layer norm kernel.
  bias: False # Whether to use bias terms in all weight matrices.
  activation: 'fast-swiglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu']
  headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
  transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
  normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
  rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this.
  rotary_base: 1000000.0
  moe_router_topk: 2
  num_moe_experts: 8
  attention_type: 'multihead' # Attention type. Options ['multihead']
  share_embeddings_and_output_weights: False # Share embedding and output layer weights.
  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
  seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595.
  num_query_groups: 8 # Number of query groups for group query attention. If None, normal attention is used.
  use_flash_attention: True

  ## Activation Checkpointing
  activations_checkpoint_granularity: null # 'selective' or 'full'
  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
  activations_checkpoint_num_layers: null # not used with 'selective'
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: True

  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  hysteresis: 2 # Gradient scale hysteresis
  fp32_residual_connection: False # Move residual connections to fp32
  fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

  # model fusions
  masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
  bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.

  use_cpu_initialization: False # Init weights on the CPU (slow for large models)
  onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
  gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism.
  openai_gelu: False
  bias_activation_fusion: False
  megatron_legacy: False

  transformer_engine: True
  fp8: False # enables fp8 in TransformerLayer forward
  fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
  fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
  fp8_margin: 0 # scaling margin
  fp8_interval: 1 # scaling update interval
  fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor
  fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history
  use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False.

  # Megatron O2-style half-precision
  megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters
  async_grad_allreduce: False
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce

  # miscellaneous
  seed: 1234
  resume_from_checkpoint: null # manually set the checkpoint file to load from
  apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
  gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

  tokenizer:
    library: 'sentencepiece'
    type: null
    model: null
    vocab_file: null
    merge_file: null
    delimiter: null # only used for tabular tokenizer
    sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

  data:
    num_workers: 1
    dataloader_type: cyclic
    data_path: null
    lazy_preprocess: True
    is_multimodal: True
    media_type: image
    sep_image_conv_front: False
    conv_template: mistral
    image_folder: null
    image_aspect_ratio: 'square'

  # Nsys profiling options
  nsys_profile:
    enabled: False
    start_step: 10 # Global batch to start profiling
    end_step: 10 # Global batch to end profiling
    ranks: [ 0 ] # Global rank IDs to profile
    gen_shape: False # Generate model and kernel details including input shapes

  optim:
    name: fused_adam
    lr: 1e-3
    weight_decay: 0.
    betas:
      - 0.9
      - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 70
      constant_steps: 0
      min_lr: 2e-5
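A minimal sketch of loading this new config and filling in the fields the YAML leaves as null before launching training; every path below is a placeholder, and pairing it with a NeVA pretraining entry point is an assumption, not something defined by this diff:

```python
# Minimal sketch: load neva_mixtral_config.yaml and fill its null fields.
# All file paths are placeholders; adjust to your data and checkpoints.
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/multimodal/multimodal_llm/neva/conf/neva_mixtral_config.yaml")

cfg.model.data.data_path = "/data/llava_pretrain/blip_laion_cc_sbu_558k.json"  # placeholder
cfg.model.data.image_folder = "/data/llava_pretrain/images"                    # placeholder
cfg.model.tokenizer.model = "/models/mixtral/tokenizer.model"                  # placeholder
cfg.model.mm_cfg.llm.from_pretrained = "/models/mixtral_8x7b.nemo"             # placeholder

# Sanity-check a few values that matter for the MoE backbone.
assert cfg.model.num_moe_experts == 8 and cfg.model.moe_router_topk == 2
print(OmegaConf.to_yaml(cfg.model.mm_cfg))
```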
@@ -70,7 +70,7 @@ model:
     llm:
       from_pretrained: #path to nemo checkpoint
       freeze: True
-      model_type: llama_2 # `nvgpt` or `llama_2` supported
+      model_type: llama_2 # `v1`, `nvgpt`, `llama_2`, `llama_3` and `mistral` supported
     vision_encoder:
       from_pretrained: "" # path or name
       from_hf: True
43 changes: 26 additions & 17 deletions nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
@@ -19,6 +19,7 @@
 from lhotse.cut import MixedCut, MonoCut
 from lhotse.dataset import AudioSamples
 from lhotse.dataset.collation import collate_vectors
+from lhotse.utils import ifnone
 
 from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper
 from nemo.collections.common.prompts.canary import CanaryPromptFormatter
@@ -186,30 +187,38 @@ def canary(
                 f"Please ensure that every utterance in the input manifests contains these keys."
             )
 
-        encoded = formatter.encode_dialog(
-            turns=[
-                dict(
-                    role="user",
-                    slots={
-                        **{slot: cut.custom[slot] for slot in expected_slots},
-                        formatter.PROMPT_LANGUAGE_SLOT: CANARY_SPECIAL_TOKENIZER,
-                    },
-                ),
+        turns = [
+            dict(
+                role="user",
+                slots={
+                    **{slot: cut.custom[slot] for slot in expected_slots},
+                    formatter.PROMPT_LANGUAGE_SLOT: CANARY_SPECIAL_TOKENIZER,
+                },
+            )
+        ]
+        if text := ' '.join(s.text for s in cut.supervisions if s.text is not None):
+            # Create answer_ids only if there is some transcript in the data.
+            turns.extend(
                 dict(
                     role="assistant",
                     slots={
-                        "text": ' '.join(s.text for s in cut.supervisions),
-                        formatter.PROMPT_LANGUAGE_SLOT: cut.supervisions[0].language,
+                        "text": text,
+                        formatter.PROMPT_LANGUAGE_SLOT: ifnone(
+                            cut.supervisions[0].language, cut.custom.get("target_lang")
+                        ),
                     },
                 ),
-            ]
-        )
+            )
+        encoded = formatter.encode_dialog(turns)
         prompts_with_answers.append(encoded["input_ids"])
         prompts.append(encoded["context_ids"])
-        assert (
-            encoded["answer_ids"][-1].item() == formatter.tokenizer.eos
-        ), f"Expected the last token in answer_ids to be EOS, but we got {encoded['answer_ids']=}"
-        answers.append(encoded["answer_ids"][:-1])  # Strip Canary's EOS
+        if "answer_ids" in encoded:
+            assert (
+                encoded["answer_ids"][-1].item() == formatter.tokenizer.eos
+            ), f"Expected the last token in answer_ids to be EOS, but we got {encoded['answer_ids']=}"
+            answers.append(encoded["answer_ids"][:-1])  # Strip Canary's EOS
+        else:
+            answers.append([])
 
     return prompts_with_answers, prompts, answers

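The change above only emits an assistant turn, and hence answer_ids, when the cut actually carries a transcript. A standalone sketch of the same control flow, using plain dicts instead of lhotse cuts and the Canary prompt formatter (all names here are illustrative, not NeMo APIs):

```python
# Standalone sketch of the updated turn-building logic, with plain dicts standing in
# for lhotse cuts and the Canary prompt formatter. Illustrative only.
def build_turns(supervision_texts, language, target_lang):
    turns = [{"role": "user", "slots": {"source_lang": "en", "target_lang": target_lang}}]
    # Mirrors `if text := ' '.join(...)`: skip the assistant turn when nothing is transcribed.
    if text := " ".join(t for t in supervision_texts if t is not None):
        turns.append(
            {
                "role": "assistant",
                # Mirrors ifnone(): fall back to the manifest's target_lang when the
                # supervision has no language set.
                "slots": {"text": text, "prompt_language": language or target_lang},
            }
        )
    return turns

print(build_turns(["hello", "world"], None, "en"))  # user + assistant turn
print(build_turns([None], None, "en"))              # user turn only -> empty answer downstream
```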
5 changes: 1 addition & 4 deletions nemo/collections/asr/models/aed_multitask_models.py
@@ -838,7 +838,7 @@ def _transcribe_forward(
             # Handling regular Canary DataLoader
             audio = batch.audio
             audio_lens = batch.audio_lens
-            decoder_input_ids = batch.prompted_transcript
+            decoder_input_ids = batch.prompt
         else:
             # Handling TensorDataset / external DataLoader
             audio, audio_lens = batch[0], batch[1]
@@ -999,13 +999,10 @@ def _may_be_make_dict_and_fix_paths(self, json_items, manifest_path, trcfg: Mult
                 entry = {
                     'audio_filepath': item,
                     'duration': 100000,
-                    trcfg.text_field: 'nothing',
                 }
             elif isinstance(item, dict):
                 entry = item
                 entry['audio_filepath'] = get_full_path(entry['audio_filepath'], manifest_file=manifest_path)
-                if trcfg.text_field not in entry:
-                    entry[trcfg.text_field] = 'nothing'
             else:
                 raise ValueError(f"Expected str or dict, got {type(item)}")
             default_turn = [t for t in trcfg.prompt if t["role"] == "user"]
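Since the placeholder 'nothing' transcript is no longer injected, a transcription manifest entry can be as small as the audio path plus a duration. An illustrative sketch of the dict built for a bare audio path after this change (the path is a placeholder):

```python
# Illustrative only: the minimal entry built for a bare audio path after this change.
# Previously trcfg.text_field was force-filled with the placeholder 'nothing'.
item = "/data/audio/utt_0001.wav"  # placeholder path

entry = {
    "audio_filepath": item,
    "duration": 100000,
}
# Transcript-related keys are now left to the prompt/answer handling in the
# dataloader (see the canary() changes above) instead of a dummy value here.
print(entry)
```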
9 changes: 7 additions & 2 deletions nemo/collections/asr/parts/mixins/transcription.py
@@ -17,7 +17,7 @@
 import tempfile
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
-from dataclasses import dataclass
+from dataclasses import dataclass, fields, is_dataclass
 from functools import partial
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -69,13 +69,18 @@ class TranscribeConfig:
 def move_to_device(batch, device, non_blocking=False):
     """
     Recursively move all tensors in `batch` to `device`.
+    Supports tensors, lists, tuples, dictionaries, and dataclasses.
     """
     if isinstance(batch, torch.Tensor):
         return batch.to(device, non_blocking=non_blocking)
     elif isinstance(batch, (list, tuple)):
-        return [move_to_device(x, device, non_blocking) for x in batch]
+        return type(batch)(move_to_device(x, device, non_blocking) for x in batch)
     elif isinstance(batch, dict):
         return {k: move_to_device(v, device, non_blocking) for k, v in batch.items()}
+    elif is_dataclass(batch):
+        return type(batch)(
+            **{field.name: move_to_device(getattr(batch, field.name), device, non_blocking) for field in fields(batch)}
+        )
     else:
         return batch  # do nothing if not supported type
 
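A small, self-contained sketch of the new dataclass branch; the helper is re-implemented here only so the example runs without importing NeMo (the real function lives in nemo.collections.asr.parts.mixins.transcription), and the Batch dataclass is a made-up example:

```python
# Self-contained sketch of the extended move_to_device: tensors, lists/tuples, dicts,
# and now dataclasses are moved field by field. Mirrors the diff above.
from dataclasses import dataclass, fields, is_dataclass

import torch


def move_to_device(batch, device, non_blocking=False):
    if isinstance(batch, torch.Tensor):
        return batch.to(device, non_blocking=non_blocking)
    elif isinstance(batch, (list, tuple)):
        return type(batch)(move_to_device(x, device, non_blocking) for x in batch)
    elif isinstance(batch, dict):
        return {k: move_to_device(v, device, non_blocking) for k, v in batch.items()}
    elif is_dataclass(batch):
        # Rebuild the dataclass with every field recursively moved.
        return type(batch)(
            **{f.name: move_to_device(getattr(batch, f.name), device, non_blocking) for f in fields(batch)}
        )
    else:
        return batch  # unsupported types are returned unchanged


@dataclass
class Batch:  # hypothetical batch container for illustration
    audio: torch.Tensor
    audio_lens: torch.Tensor


b = Batch(audio=torch.zeros(2, 16000), audio_lens=torch.tensor([16000, 16000]))
moved = move_to_device(b, "cpu")  # swap "cpu" for "cuda" when a GPU is available
print(type(moved).__name__, moved.audio.device)
```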