Skip to content

Commit

Permalink
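Fix sample_rate issues: separate sample_rate (default 16000) from the new codec_sample_rate (default 22050) used for target audio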
Browse files (browse the repository at this point in the history)
  • Loading branch information
kevinhu-nv committed Nov 26, 2024
1 parent 15bb6b7 commit 19022e6
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 3 deletions.
4 changes: 4 additions & 0 deletions nemo/collections/multimodal/speech_llm/data/build_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ def build_speechllm_dataset(model_instance, data_cfg, is_train):
speech_eos_id=data_cfg.get('speech_eos_id', 1004),
filter_by_source_target_text_ratio=data_cfg.get('filter_by_source_target_text_ratio', False),
source_target_text_ratio_limit=data_cfg.get('source_target_text_ratio_limit', 1.0),
load_answer_audio=data_cfg.get('load_answer_audio', False),
codec_model_downsampling_factor=data_cfg.get('codec_model_downsampling_factor', 1024),
sample_rate=data_cfg.get('sample_rate', 16000),
codec_sample_rate=data_cfg.get('target_audio_sample_rate', 22050),
)

# Notably, the data weights are controlled by either bucketing_weights
Expand Down
8 changes: 5 additions & 3 deletions nemo/collections/multimodal/speech_llm/data/lhotse_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ def __init__(
speech_eos_id: int = 1004,
filter_by_source_target_text_ratio: bool = False,
source_target_text_ratio_limit: float = 1.0,
sample_rate: int = 22050,
sample_rate: int = 16000,
codec_sample_rate: int = 22050,
t5_style: bool = False,
load_answer_audio: bool = False,
codec_model_downsampling_factor: float = 1023.5,
Expand Down Expand Up @@ -95,6 +96,7 @@ def __init__(
self.filter_by_source_target_text_ratio = filter_by_source_target_text_ratio
self.source_target_text_ratio_limit = source_target_text_ratio_limit
self.sample_rate = sample_rate
self.codec_sample_rate = codec_sample_rate
self.load_answer_audio = load_answer_audio
self.codec_model_downsampling_factor = codec_model_downsampling_factor

Expand Down Expand Up @@ -338,7 +340,7 @@ def collate_and_pad(inputs):
answer_audios = []
features_lens = []
for i, cut in enumerate(cuts):
answer_audio = torch.tensor(cut.target_audio.load_audio()).float()
answer_audio = torch.tensor(cut.target_audio.resample(self.codec_sample_rate).load_audio()).float()
answer_audio_len = torch.tensor(answer_audio.shape[1]).long()
answer_audios.append(answer_audio)
answer_audio_lens.append(answer_audio_len)
Expand Down Expand Up @@ -433,7 +435,7 @@ def _convert_text_to_3d_tensor(texts, include_eos=True, tokens_to_generate=0):
word_lengths,
start_time_tokens,
features_lens + 1,
self.codec_model_downsampling_factor / self.sample_rate,
self.codec_model_downsampling_factor / self.codec_sample_rate,
pad_id=text_unk_id,
)
else:
Expand Down

0 comments on commit 19022e6

Please sign in to comment.