diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py
index bc4502a24..0c760cd8a 100644
--- a/datasets/dataset_setup.py
+++ b/datasets/dataset_setup.py
@@ -94,6 +94,9 @@
 from datasets import librispeech_preprocess
 from datasets import librispeech_tokenizer

+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+tf.config.set_visible_devices([], 'GPU')
+
 flags.DEFINE_boolean(
     'interactive_deletion',
     True,
@@ -169,7 +172,6 @@
     'The number of threads to use in parallel when decompressing.')

 flags.DEFINE_string('framework', None, 'Can be either jax or pytorch.')
-flags.DEFINE_boolean('train_tokenizer', True, 'Train Librispeech tokenizer.')

 FLAGS = flags.FLAGS

@@ -458,17 +460,26 @@ def download_imagenet_v2(data_dir):
       data_dir=data_dir).download_and_prepare()


-def download_librispeech(dataset_dir, tmp_dir, train_tokenizer):
+def download_librispeech(dataset_dir, tmp_dir):
   # After extraction the result is a folder named Librispeech containing audio
   # files in .flac format along with transcripts containing name of audio file
   # and corresponding transcription.
-  tmp_librispeech_dir = os.path.join(tmp_dir, 'LibriSpeech')
+  tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech')
+  extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech')
+  final_data_dir = os.path.join(dataset_dir, 'librispeech_processed')
+  _maybe_mkdir(tmp_librispeech_dir)
+  _maybe_mkdir(final_data_dir)

   for split in ['dev', 'test']:
     for version in ['clean', 'other']:
-      wget_cmd = f'wget http://www.openslr.org/resources/12/{split}-{version}.tar.gz -O - | tar xz'  # pylint: disable=line-too-long
-      subprocess.Popen(wget_cmd, shell=True, cwd=tmp_dir).communicate()
+      wget_cmd = (
+          f'wget --directory-prefix={tmp_librispeech_dir} '
+          f'http://www.openslr.org/resources/12/{split}-{version}.tar.gz')
+      subprocess.Popen(wget_cmd, shell=True).communicate()
+      tar_path = os.path.join(tmp_librispeech_dir, f'{split}-{version}.tar.gz')
+      subprocess.Popen(
+          f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}',
+          shell=True).communicate()

   tars = [
       'raw-metadata.tar.gz',
@@ -477,19 +488,21 @@ def download_librispeech(dataset_dir, tmp_dir, train_tokenizer):
       'train-other-500.tar.gz',
   ]
   for tar_filename in tars:
-    wget_cmd = f'wget http://www.openslr.org/resources/12/{tar_filename} -O - | tar xz '  # pylint: disable=line-too-long
-    subprocess.Popen(wget_cmd, shell=True, cwd=tmp_dir).communicate()
+    wget_cmd = (f'wget --directory-prefix={tmp_librispeech_dir} '
+                f'http://www.openslr.org/resources/12/{tar_filename}')
+    subprocess.Popen(wget_cmd, shell=True).communicate()
+    tar_path = os.path.join(tmp_librispeech_dir, tar_filename)
+    subprocess.Popen(
+        f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}',
+        shell=True).communicate()

-  if train_tokenizer:
-    tokenizer_vocab_path = librispeech_tokenizer.run(
-        train=True, data_dir=tmp_librispeech_dir)
+  tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab')

-  # Preprocess data.
-  librispeech_dir = os.path.join(dataset_dir, 'librispeech')
-  librispeech_preprocess.run(
-      input_dir=tmp_librispeech_dir,
-      output_dir=librispeech_dir,
-      tokenizer_vocab_path=tokenizer_vocab_path)
+  if not os.path.exists(tokenizer_vocab_path):
+    librispeech_tokenizer.run(train=True, data_dir=extracted_data_dir)
+
+  librispeech_preprocess.run(
+      input_dir=extracted_data_dir,
+      output_dir=final_data_dir,
+      tokenizer_vocab_path=tokenizer_vocab_path)


 def download_mnist(data_dir):
@@ -577,7 +590,7 @@ def main(_):

   if FLAGS.all or FLAGS.librispeech:
     logging.info('Downloading Librispeech...')
-    download_librispeech(data_dir, tmp_dir, train_tokenizer=True)
+    download_librispeech(data_dir, tmp_dir)

   if FLAGS.all or FLAGS.cifar:
     logging.info('Downloading CIFAR...')
diff --git a/datasets/librispeech_preprocess.py b/datasets/librispeech_preprocess.py
index 2ce8d79ca..c1b7434eb 100644
--- a/datasets/librispeech_preprocess.py
+++ b/datasets/librispeech_preprocess.py
@@ -23,15 +23,6 @@
 exists = tf.io.gfile.exists
 rename = tf.io.gfile.rename

-flags.DEFINE_string('raw_input_dir',
-                    '',
-                    'Path to the raw training data directory.')
-flags.DEFINE_string('output_dir', '', 'Dir to write the processed data to.')
-flags.DEFINE_string('tokenizer_vocab_path',
-                    '',
-                    'Path to sentence piece tokenizer vocab file.')
-FLAGS = flags.FLAGS
-
 TRANSCRIPTION_MAX_LENGTH = 256
 AUDIO_MAX_LENGTH = 320000

@@ -73,7 +64,7 @@ def report_progress(count, total, start_time):
   sys.stdout.flush()


-def preprocess_data(in_folder, out_folder, tokenizer, split):
+def preprocess_data(data_folder, output_folder, tokenizer, split):
   finished = Counter()
   skipped = Counter()
   start_time = time.time()
@@ -111,8 +102,8 @@ def process(index):

     targets = tokenizer.tokenize(trans).numpy().astype(np.int32)

-    np.save('{}/{}/{}_audio.npy'.format(out_folder, split, utt), sound)
-    np.save('{}/{}/{}_targets.npy'.format(out_folder, split, utt), targets)
+    np.save('{}/{}_audio.npy'.format(output_folder, utt), sound)
+    np.save('{}/{}_targets.npy'.format(output_folder, utt), targets)

     finished.inc()
     report_progress(finished.val() + skipped.val(),
@@ -152,11 +143,11 @@ def load_audio(audio_path):
   return audio


-def run(input_dir, output_dir, tokenizer_vocab_path):
+def run(input_dir, output_dir, tokenizer_vocab_path):
   tokenizer = librispeech_tokenizer.load_tokenizer(tokenizer_vocab_path)
   os.makedirs(output_dir, exist_ok=True)

   subset_list = [
       'train-clean-100',
       'train-clean-360',
       'train-other-500',
@@ -167,22 +158,12 @@ def run(input_dir, output_dir, tokenizer_vocab_path):
   ]
   for subset in subset_list:
     logging.info('Processing split = %s...', subset)
     in_dir = os.path.join(input_dir, subset)
-    out_dir = os.path.join(output_dir, subset)
-    os.makedirs(out_dir, exist_ok=True)
-    example_ids, num_entries = preprocess_data(
-        in_dir, output_dir, tokenizer, subset)
+    subset_dir = os.path.join(output_dir, subset)
+    os.makedirs(subset_dir, exist_ok=True)
+    example_ids, num_entries = preprocess_data(
+        in_dir, subset_dir, tokenizer, subset)
     if num_entries != librispeech_example_counts[subset]:
       raise ValueError('Preprocessed dataframe final count not equal to '
                        'expected count: {} vs expected {}'.format(
                            num_entries, librispeech_example_counts[subset]))
     example_ids.to_csv(os.path.join(output_dir, f'{subset}.csv'))
-
-
-def main():
-  run(FLAGS.input_dir, FLAGS.output_dir, FLAGS.tokenizer_vocab_path)
-
-
-if __name__ == '__main__':
-  main()
diff --git a/datasets/librispeech_tokenizer.py b/datasets/librispeech_tokenizer.py
index 71aa719c2..97a03fece 100644
--- a/datasets/librispeech_tokenizer.py
+++ b/datasets/librispeech_tokenizer.py
@@ -8,7 +8,6 @@
 import tempfile
 from typing import Dict

-from absl import flags
 from absl import logging
 import sentencepiece as spm
 import tensorflow as tf
@@ -21,13 +20,6 @@

 Features = Dict[str, tf.Tensor]

-flags.DEFINE_string('input_dir', '', 'Path to training data directory.')
-flags.DEFINE_boolean(
-    'train',
-    False,
-    'Whether to train a new tokenizer or load existing one to test.')
-FLAGS = flags.FLAGS
-

 def dump_chars_for_training(data_folder, splits, maxchars: int = int(1e7)):
   char_count = 0
@@ -35,6 +27,8 @@
       delete=False, prefix='/tmp/ds_chars') as outfp:
     for split in splits:
       data_folder = data_folder + '/' + split
+      logging.info('data folder = %s', data_folder)
+      logging.info('list dir = %s', os.listdir(data_folder))
       for _, speaker_folder in enumerate(os.listdir(data_folder)):
         if char_count > maxchars:
           break
@@ -82,6 +76,9 @@ def train_tokenizer(data_dir: str,
     path to the trained sentencepiece vocabulary model.
   """
   abs_model_path = os.path.abspath(os.path.expanduser(model_path))
+  logging.info('inside train_tokenizer model_path = %s', model_path)
+  logging.info('inside train_tokenizer abs_model_path = %s', abs_model_path)
+
   charfile = dump_chars_for_training(data_dir, splits, maxchars=maxchars)

   with tempfile.NamedTemporaryFile(
@@ -115,16 +112,16 @@ def load_tokenizer(model_filepath):
       model=sp_model, add_bos=False, add_eos=True, reverse=False)
   return sp_tokenizer

-
 def run(train, data_dir):
-  logging.info('Data dir: %s', data_dir)
+  vocab_path = os.path.join(data_dir, 'spm_model.vocab')
+  logging.info('vocab_path = %s', vocab_path)
   if train:
     logging.info('Training...')
     splits = ['train-clean-100']
-    return train_tokenizer(data_dir, splits)
+    train_tokenizer(data_dir, splits, model_path=vocab_path)
   else:
-    tokenizer = load_tokenizer(os.path.join(data_dir, 'spm_model.vocab'))
+    tokenizer = load_tokenizer(vocab_path)
     test_input = 'OPEN SOURCE ROCKS'
     tokens = tokenizer.tokenize(test_input)
     detokenized = tokenizer.detokenize(tokens).numpy().decode('utf-8')
@@ -135,11 +132,3 @@ def run(train, data_dir):

   if detokenized == test_input:
     logging.info('Tokenizer working correctly!')
-
-
-def main():
-  run(FLAGS.train, FLAGS.data_dir)
-
-
-if __name__ == '__main__':
-  main()