From 5532f4970f58d311945d554517936b1771a74439 Mon Sep 17 00:00:00 2001
From: Adesoji Alu
Date: Tue, 7 Jan 2025 11:51:23 +0100
Subject: [PATCH 1/3] Update preprocess_text.py

Fix for this warning:

NeMo-text-processing :: INFO :: Creating ClassifyFst grammars.
  0%| | 0/8 [00:00
---
 .../dataset_processing/tts/preprocess_text.py | 43 ++++++++++++++++---
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/scripts/dataset_processing/tts/preprocess_text.py b/scripts/dataset_processing/tts/preprocess_text.py
index 6afab42a1d6b..f2ad950531ec 100644
--- a/scripts/dataset_processing/tts/preprocess_text.py
+++ b/scripts/dataset_processing/tts/preprocess_text.py
@@ -26,7 +26,8 @@
     --num_workers=4 \
     --joblib_batch_size=16
 """
-
+import os
+import json
 import argparse
 from pathlib import Path
 
@@ -102,14 +103,44 @@ def _process_entry(
     text = entry[text_key]
 
     if normalizer is not None:
-        if lower_case_norm:
-            text = text.lower()
-        text = normalizer.normalize(text, punct_pre_process=True, punct_post_process=True)
+        # Define additional split symbols to enhance splitting
+        additional_split_symbols = ";|:"  # Adjust based on your dataset's characteristics
+
+        # Split text into sentences using additional split symbols
+        sentences = normalizer.split_text_into_sentences(text, additional_split_symbols=additional_split_symbols)
+
+        # Further split sentences longer than 500 words
+        split_sentences = []
+        for sentence in sentences:
+            words = sentence.split()
+            if len(words) > 500:
+                # Split into chunks of 500 words
+                for i in range(0, len(words), 500):
+                    chunk = ' '.join(words[i:i+500])
+                    split_sentences.append(chunk)
+            else:
+                split_sentences.append(sentence)
+
+        # Log sentences exceeding 500 words (for debugging)
+        for idx, sentence in enumerate(split_sentences):
+            word_count = len(sentence.split())
+            if word_count > 500:
+                print(f"Warning: Sentence {idx} with {word_count} words is still too long.")
+
+        # Normalize each sentence individually
+        normalized_sentences = [
+            normalizer.normalize(sentence, punct_pre_process=True, punct_post_process=True)
+            for sentence in split_sentences
+        ]
+        # Concatenate normalized sentences
+        normalized_text = ' '.join(normalized_sentences)
+    else:
+        normalized_text = text
 
     if lower_case:
-        text = text.lower()
+        normalized_text = normalized_text.lower()
 
-    entry[normalized_text_key] = text
+    entry[normalized_text_key] = normalized_text
 
     return entry

From 573682f76609337ebcf6aa79bbf172e23ea6533e Mon Sep 17 00:00:00 2001
From: Adesoji1
Date: Tue, 7 Jan 2025 10:52:21 +0000
Subject: [PATCH 2/3] Apply isort and black reformatting

Signed-off-by: Adesoji1
---
 .../dataset_processing/tts/preprocess_text.py | 39 +++++++++++++------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/scripts/dataset_processing/tts/preprocess_text.py b/scripts/dataset_processing/tts/preprocess_text.py
index f2ad950531ec..ff36b1415807 100644
--- a/scripts/dataset_processing/tts/preprocess_text.py
+++ b/scripts/dataset_processing/tts/preprocess_text.py
@@ -26,9 +26,9 @@
     --num_workers=4 \
     --joblib_batch_size=16
 """
-import os
-import json
 import argparse
+import json
+import os
 from pathlib import Path
 
 from hydra.utils import instantiate
@@ -50,13 +50,20 @@
 
 def get_args():
     parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Process and normalize text data.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="Process and normalize text data.",
     )
     parser.add_argument(
-        "--input_manifest", required=True, type=Path, help="Path to input training manifest.",
+        "--input_manifest",
+        required=True,
+        type=Path,
+        help="Path to input training manifest.",
     )
     parser.add_argument(
-        "--output_manifest", required=True, type=Path, help="Path to output training manifest with processed text.",
+        "--output_manifest",
+        required=True,
+        type=Path,
+        help="Path to output training manifest with processed text.",
     )
     parser.add_argument(
         "--overwrite",
@@ -64,13 +71,21 @@ def get_args():
         help="Whether to overwrite the output manifest file if it exists.",
     )
     parser.add_argument(
-        "--text_key", default="text", type=str, help="Input text field to normalize.",
+        "--text_key",
+        default="text",
+        type=str,
+        help="Input text field to normalize.",
     )
     parser.add_argument(
-        "--normalized_text_key", default="normalized_text", type=str, help="Output field to save normalized text to.",
+        "--normalized_text_key",
+        default="normalized_text",
+        type=str,
+        help="Output field to save normalized text to.",
     )
     parser.add_argument(
-        "--lower_case", action=argparse.BooleanOptionalAction, help="Whether to convert the final text to lower case.",
+        "--lower_case",
+        action=argparse.BooleanOptionalAction,
+        help="Whether to convert the final text to lower case.",
     )
     parser.add_argument(
         "--normalizer_config_path",
@@ -108,7 +123,7 @@ def _process_entry(
 
         # Split text into sentences using additional split symbols
         sentences = normalizer.split_text_into_sentences(text, additional_split_symbols=additional_split_symbols)
-        
+
         # Further split sentences longer than 500 words
         split_sentences = []
         for sentence in sentences:
@@ -116,11 +131,11 @@ def _process_entry(
             if len(words) > 500:
                 # Split into chunks of 500 words
                 for i in range(0, len(words), 500):
-                    chunk = ' '.join(words[i:i+500])
+                    chunk = ' '.join(words[i : i + 500])
                     split_sentences.append(chunk)
             else:
                 split_sentences.append(sentence)
-        
+
         # Log sentences exceeding 500 words (for debugging)
         for idx, sentence in enumerate(split_sentences):
             word_count = len(sentence.split())
@@ -129,7 +144,7 @@ def _process_entry(
 
         # Normalize each sentence individually
         normalized_sentences = [
-            normalizer.normalize(sentence, punct_pre_process=True, punct_post_process=True) 
+            normalizer.normalize(sentence, punct_pre_process=True, punct_post_process=True)
             for sentence in split_sentences
         ]
         # Concatenate normalized sentences

From 8a4e3cf439c60cd05006de58b046a999d9e412ee Mon Sep 17 00:00:00 2001
From: Adesoji Alu
Date: Tue, 7 Jan 2025 21:23:04 +0100
Subject: [PATCH 3/3] Update preprocess_text.py

Fix for this warning:

NeMo-text-processing :: INFO :: Creating ClassifyFst grammars.
  0%| | 0/8 [00:00
---
 scripts/dataset_processing/tts/preprocess_text.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/scripts/dataset_processing/tts/preprocess_text.py b/scripts/dataset_processing/tts/preprocess_text.py
index ff36b1415807..a47a1b9a4f33 100644
--- a/scripts/dataset_processing/tts/preprocess_text.py
+++ b/scripts/dataset_processing/tts/preprocess_text.py
@@ -27,8 +27,6 @@
     --joblib_batch_size=16
 """
 import argparse
-import json
-import os
 from pathlib import Path
 
 from hydra.utils import instantiate
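
For reference, the net effect of [PATCH 1/3] on _process_entry can be summarized by the standalone sketch below. It is illustrative only: normalize_long_text is a hypothetical helper name, and `normalizer` is assumed to be a NeMo-text-processing Normalizer whose split_text_into_sentences() and normalize() methods are the ones the patch itself calls.

    def normalize_long_text(text: str, normalizer, max_words: int = 500) -> str:
        """Normalize text piecewise so no single normalize() call sees a very long input."""
        # Split on sentence boundaries first, widening the split set as the patch does.
        sentences = normalizer.split_text_into_sentences(text, additional_split_symbols=";|:")

        # Cap every piece at max_words words, chunking any sentence that is still too long.
        pieces = []
        for sentence in sentences:
            words = sentence.split()
            if len(words) > max_words:
                for i in range(0, len(words), max_words):
                    pieces.append(" ".join(words[i : i + max_words]))
            else:
                pieces.append(sentence)

        # Normalize each piece independently, then rejoin with single spaces.
        return " ".join(
            normalizer.normalize(piece, punct_pre_process=True, punct_post_process=True)
            for piece in pieces
        )

Chunking at a hard word cap guarantees bounded normalizer inputs, at the cost of occasionally splitting mid-clause, so the rejoined output may differ slightly in punctuation from normalizing the whole text in one call.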