From 209f1ddabf38a0bef563e02dc73ca375a951a82b Mon Sep 17 00:00:00 2001 From: Sourabh Medapati Date: Thu, 27 Jul 2023 23:57:33 +0000 Subject: [PATCH 1/5] librispeech dataset script fixes --- datasets/dataset_setup.py | 46 +++++++++++++++++++----------- datasets/librispeech_preprocess.py | 17 ----------- datasets/librispeech_tokenizer.py | 22 +++----------- 3 files changed, 33 insertions(+), 52 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index bc4502a24..d4c9af73c 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -169,9 +169,10 @@ 'The number of threads to use in parallel when decompressing.') flags.DEFINE_string('framework', None, 'Can be either jax or pytorch.') -flags.DEFINE_boolean('train_tokenizer', True, 'Train Librispeech tokenizer.') FLAGS = flags.FLAGS +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" +tf.config.set_visible_devices([], 'GPU') def _maybe_mkdir(d): if not os.path.exists(d): @@ -458,17 +459,26 @@ def download_imagenet_v2(data_dir): data_dir=data_dir).download_and_prepare() -def download_librispeech(dataset_dir, tmp_dir, train_tokenizer): +def download_librispeech(dataset_dir, tmp_dir): # After extraction the result is a folder named Librispeech containing audio # files in .flac format along with transcripts containing name of audio file # and corresponding transcription. - tmp_librispeech_dir = os.path.join(tmp_dir, 'LibriSpeech') + tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech') + extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech') + final_data_dir = os.path.join(dataset_dir, 'librispeech_processed') + _maybe_mkdir(tmp_librispeech_dir) for split in ['dev', 'test']: for version in ['clean', 'other']: - wget_cmd = f'wget http://www.openslr.org/resources/12/{split}-{version}.tar.gz -O - | tar xz' # pylint: disable=line-too-long - subprocess.Popen(wget_cmd, shell=True, cwd=tmp_dir).communicate() + wget_cmd = ( + f'wget --directory-prefix={tmp_librispeech_dir} ' + f'http://www.openslr.org/resources/12/{split}-{version}.tar.gz') + subprocess.Popen(wget_cmd, shell=True).communicate() + tar_path = os.path.join(tmp_librispeech_dir, f'{split}-{version}.tar.gz') + subprocess.Popen( + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() + tars = [ 'raw-metadata.tar.gz', @@ -477,19 +487,21 @@ def download_librispeech(dataset_dir, tmp_dir, train_tokenizer): 'train-other-500.tar.gz', ] for tar_filename in tars: - wget_cmd = f'wget http://www.openslr.org/resources/12/{tar_filename} -O - | tar xz ' # pylint: disable=line-too-long - subprocess.Popen(wget_cmd, shell=True, cwd=tmp_dir).communicate() + wget_cmd = (f'wget --directory-prefix={tmp_librispeech_dir} ' + f'http://www.openslr.org/resources/12/{tar_filename}') + subprocess.Popen(wget_cmd, shell=True).communicate() + tar_path = os.path.join(tmp_librispeech_dir, tar_filename) + subprocess.Popen(f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() + + tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab') - if train_tokenizer: - tokenizer_vocab_path = librispeech_tokenizer.run( - train=True, data_dir=tmp_librispeech_dir) + if not os.path.exists(tokenizer_vocab_path): + librispeech_tokenizer.run(train=True, data_dir=extracted_data_dir) - # Preprocess data. 
-  librispeech_dir = os.path.join(dataset_dir, 'librispeech')
-  librispeech_preprocess.run(
-      input_dir=tmp_librispeech_dir,
-      output_dir=librispeech_dir,
-      tokenizer_vocab_path=tokenizer_vocab_path)
+  librispeech_preprocess.run(
+      input_dir=extracted_data_dir,
+      output_dir=final_data_dir,
+      tokenizer_vocab_path=tokenizer_vocab_path)
 
 
 def download_mnist(data_dir):
@@ -577,7 +589,7 @@ def main(_):
 
   if FLAGS.all or FLAGS.librispeech:
     logging.info('Downloading Librispeech...')
-    download_librispeech(data_dir, tmp_dir, train_tokenizer=True)
+    download_librispeech(data_dir, tmp_dir)
 
   if FLAGS.all or FLAGS.cifar:
     logging.info('Downloading CIFAR...')
diff --git a/datasets/librispeech_preprocess.py b/datasets/librispeech_preprocess.py
index 2ce8d79ca..b938c8428 100644
--- a/datasets/librispeech_preprocess.py
+++ b/datasets/librispeech_preprocess.py
@@ -9,7 +9,6 @@
 import threading
 import time
 
-from absl import flags
 from absl import logging
 import numpy as np
 import pandas as pd
@@ -23,14 +22,6 @@
 exists = tf.io.gfile.exists
 rename = tf.io.gfile.rename
 
-flags.DEFINE_string('raw_input_dir',
-                    '',
-                    'Path to the raw training data directory.')
-flags.DEFINE_string('output_dir', '', 'Dir to write the processed data to.')
-flags.DEFINE_string('tokenizer_vocab_path',
-                    '',
-                    'Path to sentence piece tokenizer vocab file.')
-FLAGS = flags.FLAGS
 
 TRANSCRIPTION_MAX_LENGTH = 256
 AUDIO_MAX_LENGTH = 320000
@@ -178,11 +169,3 @@ def run(input_dir, output_dir, tokenizer_vocab_path):
           'expected count: {} vs expected {}'.format(
               num_entries, librispeech_example_counts[subset]))
     example_ids.to_csv(os.path.join(output_dir, f'{subset}.csv'))
-
-
-def main():
-  run(FLAGS.input_dir, FLAGS.output_dir, FLAGS.tokenizer_vocab_path)
-
-
-if __name__ == '__main__':
-  main()
diff --git a/datasets/librispeech_tokenizer.py b/datasets/librispeech_tokenizer.py
index 71aa719c2..e701d59d4 100644
--- a/datasets/librispeech_tokenizer.py
+++ b/datasets/librispeech_tokenizer.py
@@ -8,7 +8,6 @@
 import tempfile
 from typing import Dict
 
-from absl import flags
 from absl import logging
 import sentencepiece as spm
 import tensorflow as tf
@@ -21,13 +20,6 @@
 
 Features = Dict[str, tf.Tensor]
 
-flags.DEFINE_string('input_dir', '', 'Path to training data directory.')
-flags.DEFINE_boolean(
-    'train',
-    False,
-    'Whether to train a new tokenizer or load existing one to test.')
-FLAGS = flags.FLAGS
-
 
 def dump_chars_for_training(data_folder, splits, maxchars: int = int(1e7)):
   char_count = 0
@@ -118,13 +110,15 @@ def load_tokenizer(model_filepath):
 
 def run(train, data_dir):
   logging.info('Data dir: %s', data_dir)
+  vocab_path = os.path.join(data_dir, 'spm_model.vocab')
+  logging.info('vocab_path = %s', vocab_path)
 
   if train:
     logging.info('Training...')
     splits = ['train-clean-100']
-    return train_tokenizer(data_dir, splits)
+    train_tokenizer(data_dir, splits, model_path=vocab_path)
   else:
-    tokenizer = load_tokenizer(os.path.join(data_dir, 'spm_model.vocab'))
+    tokenizer = load_tokenizer(vocab_path)
     test_input = 'OPEN SOURCE ROCKS'
     tokens = tokenizer.tokenize(test_input)
     detokenized = tokenizer.detokenize(tokens).numpy().decode('utf-8')
@@ -135,11 +129,3 @@
 
   if detokenized == test_input:
     logging.info('Tokenizer working correctly!')
-
-
-def main():
-  run(FLAGS.train, FLAGS.data_dir)
-
-
-if __name__ == '__main__':
-  main()

From f4315368bbd671085e835fee240202900dd1f7bf Mon Sep 17 00:00:00 2001
From: Sourabh Medapati
Date: Thu, 27 Jul 2023 23:59:56 +0000
Subject: [PATCH 2/5] indentation fix

---
 datasets/dataset_setup.py | 2 +-
 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index d4c9af73c..37e49cd84 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -487,7 +487,7 @@ def download_librispeech(dataset_dir, tmp_dir): 'train-other-500.tar.gz', ] for tar_filename in tars: - wget_cmd = (f'wget --directory-prefix={tmp_librispeech_dir} ' + wget_cmd = (f'wget --directory-prefix={tmp_librispeech_dir} ' f'http://www.openslr.org/resources/12/{tar_filename}') subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, tar_filename) From 2d0f5d46332fd5cb3e99fbb8033e47e618110bea Mon Sep 17 00:00:00 2001 From: Sourabh Medapati Date: Fri, 28 Jul 2023 00:08:47 +0000 Subject: [PATCH 3/5] yapf fixes --- datasets/dataset_setup.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index 37e49cd84..8bfc8e4f2 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -174,6 +174,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "-1" tf.config.set_visible_devices([], 'GPU') + def _maybe_mkdir(d): if not os.path.exists(d): os.makedirs(d) @@ -466,7 +467,7 @@ def download_librispeech(dataset_dir, tmp_dir): tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech') extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech') final_data_dir = os.path.join(dataset_dir, 'librispeech_processed') - + _maybe_mkdir(tmp_librispeech_dir) for split in ['dev', 'test']: @@ -477,7 +478,8 @@ def download_librispeech(dataset_dir, tmp_dir): subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, f'{split}-{version}.tar.gz') subprocess.Popen( - f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', + shell=True).communicate() tars = [ @@ -491,7 +493,9 @@ def download_librispeech(dataset_dir, tmp_dir): f'http://www.openslr.org/resources/12/{tar_filename}') subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, tar_filename) - subprocess.Popen(f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() + subprocess.Popen( + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', + shell=True).communicate() tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab') From 32eab1255458eb290924fc527ddd7487cdfd7bf5 Mon Sep 17 00:00:00 2001 From: Sourabh Medapati Date: Fri, 28 Jul 2023 00:14:49 +0000 Subject: [PATCH 4/5] yapf fixes --- datasets/dataset_setup.py | 5 ++--- datasets/librispeech_preprocess.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index 8bfc8e4f2..2c9675bd3 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -481,7 +481,6 @@ def download_librispeech(dataset_dir, tmp_dir): f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() - tars = [ 'raw-metadata.tar.gz', 'train-clean-100.tar.gz', @@ -494,8 +493,8 @@ def download_librispeech(dataset_dir, tmp_dir): subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, tar_filename) subprocess.Popen( - f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', - shell=True).communicate() + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', + shell=True).communicate() tokenizer_vocab_path = os.path.join(extracted_data_dir, 
'spm_model.vocab') diff --git a/datasets/librispeech_preprocess.py b/datasets/librispeech_preprocess.py index b938c8428..0968f2a00 100644 --- a/datasets/librispeech_preprocess.py +++ b/datasets/librispeech_preprocess.py @@ -22,7 +22,6 @@ exists = tf.io.gfile.exists rename = tf.io.gfile.rename - TRANSCRIPTION_MAX_LENGTH = 256 AUDIO_MAX_LENGTH = 320000 From 1ceb96d237d5be684d1bd1bb1596353ee62b8167 Mon Sep 17 00:00:00 2001 From: Sourabh Medapati Date: Fri, 28 Jul 2023 00:16:35 +0000 Subject: [PATCH 5/5] yapf fixes --- datasets/dataset_setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index 2c9675bd3..5b9f02904 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -478,7 +478,7 @@ def download_librispeech(dataset_dir, tmp_dir): subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, f'{split}-{version}.tar.gz') subprocess.Popen( - f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() tars = [ @@ -493,7 +493,7 @@ def download_librispeech(dataset_dir, tmp_dir): subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, tar_filename) subprocess.Popen( - f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab')
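
Usage sketch (not part of the patch series above): after these commits, the
LibriSpeech download, tokenizer training, and preprocessing are all driven
through a single entry point in datasets/dataset_setup.py. A minimal driver,
assuming the datasets/ directory is importable as a package and using
placeholder paths:

    from datasets import dataset_setup

    # Fetches the dev/test and train tarballs into <tmp_dir>/librispeech,
    # extracts them to <tmp_dir>/librispeech/LibriSpeech, trains the
    # SentencePiece tokenizer only when spm_model.vocab is absent there,
    # and writes processed examples to <dataset_dir>/librispeech_processed.
    dataset_setup.download_librispeech('/data', '/data/tmp')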