fixing librispeech dataset setup script #464

Closed · wants to merge 6 commits
47 changes: 30 additions & 17 deletions datasets/dataset_setup.py
@@ -94,6 +94,9 @@
from datasets import librispeech_preprocess
from datasets import librispeech_tokenizer

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
tf.config.set_visible_devices([], 'GPU')

flags.DEFINE_boolean(
'interactive_deletion',
True,
@@ -169,7 +172,6 @@
'The number of threads to use in parallel when decompressing.')

flags.DEFINE_string('framework', None, 'Can be either jax or pytorch.')
flags.DEFINE_boolean('train_tokenizer', True, 'Train Librispeech tokenizer.')
FLAGS = flags.FLAGS
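
The hunk at the top of the file pins the setup script to CPU before any TensorFlow op runs, and the `train_tokenizer` flag is dropped because tokenizer training is now decided automatically inside `download_librispeech` (see below). A minimal standalone sketch of the CPU-only pattern, assuming nothing beyond `tensorflow` being installed:

```python
# Hide GPUs from both CUDA and TensorFlow so the download/preprocess
# script never claims device memory. The env var must be set before
# TensorFlow initializes CUDA.
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import tensorflow as tf  # noqa: E402  (imported after the env var on purpose)

# Belt and braces: also remove any GPU devices from TF's visible set.
tf.config.set_visible_devices([], 'GPU')

print(tf.config.get_visible_devices('GPU'))  # -> []
```
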


@@ -458,17 +460,26 @@ def download_imagenet_v2(data_dir):
data_dir=data_dir).download_and_prepare()


def download_librispeech(dataset_dir, tmp_dir, train_tokenizer):
def download_librispeech(dataset_dir, tmp_dir):
# After extraction the result is a folder named Librispeech containing audio
# files in .flac format along with transcripts containing name of audio file
# and corresponding transcription.
tmp_librispeech_dir = os.path.join(tmp_dir, 'LibriSpeech')
tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech')
extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech')
final_data_dir = os.path.join(dataset_dir, 'librispeech_processed')

_maybe_mkdir(tmp_librispeech_dir)
_maybe_mkdir(final_data_dir)

for split in ['dev', 'test']:
for version in ['clean', 'other']:
wget_cmd = f'wget http://www.openslr.org/resources/12/{split}-{version}.tar.gz -O - | tar xz' # pylint: disable=line-too-long
subprocess.Popen(wget_cmd, shell=True, cwd=tmp_dir).communicate()
wget_cmd = (
f'wget --directory-prefix={tmp_librispeech_dir} '
f'http://www.openslr.org/resources/12/{split}-{version}.tar.gz')
subprocess.Popen(wget_cmd, shell=True).communicate()
tar_path = os.path.join(tmp_librispeech_dir, f'{split}-{version}.tar.gz')
subprocess.Popen(
f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate()

tars = [
'raw-metadata.tar.gz',
@@ -477,19 +488,21 @@ def download_librispeech(dataset_dir, tmp_dir, train_tokenizer):
'train-other-500.tar.gz',
]
for tar_filename in tars:
wget_cmd = f'wget http://www.openslr.org/resources/12/{tar_filename} -O - | tar xz ' # pylint: disable=line-too-long
subprocess.Popen(wget_cmd, shell=True, cwd=tmp_dir).communicate()
wget_cmd = (f'wget --directory-prefix={tmp_librispeech_dir} '
f'http://www.openslr.org/resources/12/{tar_filename}')
subprocess.Popen(wget_cmd, shell=True).communicate()
tar_path = os.path.join(tmp_librispeech_dir, tar_filename)
subprocess.Popen(f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate()

if train_tokenizer:
tokenizer_vocab_path = librispeech_tokenizer.run(
train=True, data_dir=tmp_librispeech_dir)
tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab')

# Preprocess data.
librispeech_dir = os.path.join(dataset_dir, 'librispeech')
librispeech_preprocess.run(
input_dir=tmp_librispeech_dir,
output_dir=librispeech_dir,
tokenizer_vocab_path=tokenizer_vocab_path)
if not os.path.exists(tokenizer_vocab_path):
librispeech_tokenizer.run(train=True, data_dir=extracted_data_dir)

librispeech_preprocess.run(
input_dir=extracted_data_dir,
output_dir=final_data_dir,
tokenizer_vocab_path=tokenizer_vocab_path)


def download_mnist(data_dir):
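
For readers skimming the diff: the piped `wget ... -O - | tar xz` commands are replaced by a download step followed by a separate extraction step inside a dedicated temp directory. A hedged sketch of that two-step pattern (the `download_and_extract` helper is illustrative, not part of the PR, and it assumes `wget` and `tar` are on PATH):

```python
import os
import subprocess

def download_and_extract(url, dest_dir):
  """Download a tarball into dest_dir, then extract it in place."""
  os.makedirs(dest_dir, exist_ok=True)
  tar_path = os.path.join(dest_dir, os.path.basename(url))
  # Keeping the archive on disk (instead of piping wget into tar) makes
  # failed downloads visible and lets re-runs reuse already-fetched files.
  subprocess.run(['wget', f'--directory-prefix={dest_dir}', url], check=True)
  subprocess.run(['tar', 'xzf', tar_path, '--directory', dest_dir], check=True)

tmp_librispeech_dir = '/tmp/librispeech'
for split in ['dev', 'test']:
  for version in ['clean', 'other']:
    download_and_extract(
        f'http://www.openslr.org/resources/12/{split}-{version}.tar.gz',
        tmp_librispeech_dir)
```
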
@@ -577,7 +590,7 @@ def main(_):

if FLAGS.all or FLAGS.librispeech:
logging.info('Downloading Librispeech...')
download_librispeech(data_dir, tmp_dir, train_tokenizer=True)
download_librispeech(data_dir, tmp_dir)

if FLAGS.all or FLAGS.cifar:
logging.info('Downloading CIFAR...')
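
Pieced together from the hunks above, the end-to-end LibriSpeech flow in the new script is roughly the following (paths are placeholders; `librispeech_tokenizer.run` and `librispeech_preprocess.run` are the repo's own entry points):

```python
import os

from datasets import librispeech_preprocess
from datasets import librispeech_tokenizer

extracted_data_dir = '/tmp/librispeech/LibriSpeech'   # produced by the tar steps
final_data_dir = '/data/librispeech_processed'        # written by preprocessing

# Train the SentencePiece tokenizer only if its vocab file is not already present.
tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab')
if not os.path.exists(tokenizer_vocab_path):
  librispeech_tokenizer.run(train=True, data_dir=extracted_data_dir)

librispeech_preprocess.run(
    input_dir=extracted_data_dir,
    output_dir=final_data_dir,
    tokenizer_vocab_path=tokenizer_vocab_path)
```
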
35 changes: 8 additions & 27 deletions datasets/librispeech_preprocess.py
@@ -23,15 +23,6 @@
exists = tf.io.gfile.exists
rename = tf.io.gfile.rename

flags.DEFINE_string('raw_input_dir',
'',
'Path to the raw training data directory.')
flags.DEFINE_string('output_dir', '', 'Dir to write the processed data to.')
flags.DEFINE_string('tokenizer_vocab_path',
'',
'Path to sentence piece tokenizer vocab file.')
FLAGS = flags.FLAGS

TRANSCRIPTION_MAX_LENGTH = 256
AUDIO_MAX_LENGTH = 320000

@@ -73,7 +64,7 @@ def report_progress(count, total, start_time):
sys.stdout.flush()


def preprocess_data(in_folder, out_folder, tokenizer, split):
def preprocess_data(data_folder, output_folder, tokenizer, split):
finished = Counter()
skipped = Counter()
start_time = time.time()
@@ -111,8 +102,8 @@ def process(index):

targets = tokenizer.tokenize(trans).numpy().astype(np.int32)

np.save('{}/{}/{}_audio.npy'.format(out_folder, split, utt), sound)
np.save('{}/{}/{}_targets.npy'.format(out_folder, split, utt), targets)
np.save('{}/{}_audio.npy'.format(output_split_dir, utt), sound)
np.save('{}/{}_targets.npy'.format(output_split_dir, utt), targets)

finished.inc()
report_progress(finished.val() + skipped.val(),
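
The `np.save` calls above now target a per-split output directory rather than interpolating `out_folder/split` on every write. A small sketch of the resulting on-disk layout (names and array contents are placeholders):

```python
import os

import numpy as np

output_folder = '/data/librispeech_processed'
split = 'dev-clean'
utt = '1272-128104-0000'   # LibriSpeech utterance id, for illustration

output_split_dir = os.path.join(output_folder, split)
os.makedirs(output_split_dir, exist_ok=True)

sound = np.zeros(16000, dtype=np.int64)          # placeholder waveform
targets = np.array([5, 42, 7], dtype=np.int32)   # placeholder token ids

# One pair of files per utterance: <utt>_audio.npy and <utt>_targets.npy.
np.save(os.path.join(output_split_dir, f'{utt}_audio.npy'), sound)
np.save(os.path.join(output_split_dir, f'{utt}_targets.npy'), targets)
```
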
@@ -152,11 +143,11 @@ def load_audio(audio_path):
return audio


def run(input_dir, output_dir, tokenizer_vocab_path):
tokenizer = librispeech_tokenizer.load_tokenizer(tokenizer_vocab_path)
os.makedirs(output_dir, exist_ok=True)

subset_list = [
split_list = [
'train-clean-100',
'train-clean-360',
'train-other-500',
@@ -167,22 +158,12 @@ def run(input_dir, output_dir, tokenizer_vocab_path):
]
for subset in subset_list:
logging.info('Processing split = %s...', subset)
in_dir = os.path.join(input_dir, subset)
out_dir = os.path.join(output_dir, subset)
os.makedirs(out_dir, exist_ok=True)
example_ids, num_entries = preprocess_data(
in_dir, output_dir, tokenizer, subset)
subset_dir = os.path.join(output_dir, subset)
os.makedirs(subset_dir, exist_ok=True)
example_ids, num_entries = preprocess_data(subset_dir, tokenizer, subset)

if num_entries != librispeech_example_counts[subset]:
raise ValueError('Preprocessed dataframe final count not equal to '
'expected count: {} vs expected {}'.format(
num_entries, librispeech_example_counts[subset]))
example_ids.to_csv(os.path.join(output_dir, f'{subset}.csv'))


def main():
run(FLAGS.input_dir, FLAGS.output_dir, FLAGS.tokenizer_vocab_path)


if __name__ == '__main__':
main()
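
For reference, the bookkeeping at the end of the loop in `run()` boils down to a count check plus an index CSV per split. A hedged sketch (the expected count below is a placeholder; the real `librispeech_example_counts` mapping lives in `librispeech_preprocess.py`, and `example_ids` is assumed to be a pandas DataFrame):

```python
import os

import pandas as pd

librispeech_example_counts = {'dev-clean': 2703}  # placeholder entry

def finalize_split(example_ids: pd.DataFrame,
                   num_entries: int,
                   subset: str,
                   output_dir: str) -> None:
  expected = librispeech_example_counts[subset]
  if num_entries != expected:
    raise ValueError('Preprocessed dataframe final count not equal to '
                     'expected count: {} vs expected {}'.format(
                         num_entries, expected))
  # One CSV per split listing the utterance ids that were written.
  example_ids.to_csv(os.path.join(output_dir, f'{subset}.csv'))
```
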
29 changes: 9 additions & 20 deletions datasets/librispeech_tokenizer.py
@@ -8,7 +8,6 @@
import tempfile
from typing import Dict

from absl import flags
from absl import logging
import sentencepiece as spm
import tensorflow as tf
@@ -21,20 +20,15 @@

Features = Dict[str, tf.Tensor]

flags.DEFINE_string('input_dir', '', 'Path to training data directory.')
flags.DEFINE_boolean(
'train',
False,
'Whether to train a new tokenizer or load existing one to test.')
FLAGS = flags.FLAGS


def dump_chars_for_training(data_folder, splits, maxchars: int = int(1e7)):
char_count = 0
with tempfile.NamedTemporaryFile(
delete=False, prefix='/tmp/ds_chars') as outfp:
for split in splits:
data_folder = data_folder + '/' + split
logging.info('data folder = ', data_folder)
logging.info('list dir = ', os.listdir(data_folder))
for _, speaker_folder in enumerate(os.listdir(data_folder)):
if char_count > maxchars:
break
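
The character-dumping loop above feeds SentencePiece training. A hedged sketch of that pipeline, assuming LibriSpeech's usual `<utt-id> TRANSCRIPT` lines in `*.trans.txt` files (the vocab size and output prefix are illustrative, not the module's actual settings):

```python
import os
import tempfile

import sentencepiece as spm

def dump_chars(data_folder, splits, maxchars=int(1e7)):
  """Write raw transcript text for the given splits into one temp file."""
  char_count = 0
  outfp = tempfile.NamedTemporaryFile(delete=False, mode='w', prefix='ds_chars')
  with outfp:
    for split in splits:
      split_dir = os.path.join(data_folder, split)
      for root, _, files in os.walk(split_dir):
        for fname in files:
          if char_count > maxchars or not fname.endswith('.trans.txt'):
            continue
          with open(os.path.join(root, fname)) as fin:
            for line in fin:
              # Each line is "<utterance-id> <TRANSCRIPT IN UPPER CASE>".
              text = line.split(' ', maxsplit=1)[1]
              char_count += len(text)
              outfp.write(text)
  return outfp.name

charfile = dump_chars('/tmp/librispeech/LibriSpeech', ['train-clean-100'])
spm.SentencePieceTrainer.Train(
    input=charfile, model_prefix='/tmp/spm_model', vocab_size=1024)
```
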
@@ -82,6 +76,9 @@ def train_tokenizer(data_dir: str,
path to the trained sentencepiece vocabulary model.
"""
abs_model_path = os.path.abspath(os.path.expanduser(model_path))
logging.info('inside train_tokenizer model_path = ', model_path)
logging.info('inside train_tokenizer abs_model_path = ', abs_model_path)

charfile = dump_chars_for_training(data_dir, splits, maxchars=maxchars)

with tempfile.NamedTemporaryFile(
@@ -115,16 +112,16 @@ def load_tokenizer(model_filepath):
model=sp_model, add_bos=False, add_eos=True, reverse=False)
return sp_tokenizer
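
`load_tokenizer` (partially shown above) wraps the trained model in `tensorflow_text`'s SentencepieceTokenizer. A minimal usage sketch, assuming the serialized model proto has already been written to disk (the path is a placeholder):

```python
import tensorflow as tf
import tensorflow_text as tf_text

# Read the serialized SentencePiece model produced by training.
with tf.io.gfile.GFile('/tmp/spm_model.model', 'rb') as f:
  sp_model = f.read()

sp_tokenizer = tf_text.SentencepieceTokenizer(
    model=sp_model, add_bos=False, add_eos=True, reverse=False)

tokens = sp_tokenizer.tokenize('OPEN SOURCE ROCKS')
print(sp_tokenizer.detokenize(tokens).numpy().decode('utf-8'))
```
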


def run(train, data_dir):
logging.info('Data dir: %s', data_dir)
vocab_path = os.path.join(data_dir, 'spm_model.vocab')
logging.info('vocab_path = ', vocab_path)

if train:
logging.info('Training...')
splits = ['train-clean-100']
return train_tokenizer(data_dir, splits)
train_tokenizer(data_dir, splits, model_path=vocab_path)
else:
tokenizer = load_tokenizer(os.path.join(data_dir, 'spm_model.vocab'))
tokenizer = load_tokenizer(vocab_path)
test_input = 'OPEN SOURCE ROCKS'
tokens = tokenizer.tokenize(test_input)
detokenized = tokenizer.detokenize(tokens).numpy().decode('utf-8')
@@ -135,11 +132,3 @@ def run(train, data_dir):

if detokenized == test_input:
logging.info('Tokenizer working correctly!')


def main():
run(FLAGS.train, FLAGS.data_dir)


if __name__ == '__main__':
main()
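
With its flags and `main()` removed, this module is driven entirely through `run()`. A short usage sketch of the two branches shown above (the data directory is a placeholder):

```python
from datasets import librispeech_tokenizer

data_dir = '/tmp/librispeech/LibriSpeech'

# First call: train a SentencePiece model on train-clean-100 and write
# spm_model.vocab into data_dir.
librispeech_tokenizer.run(train=True, data_dir=data_dir)

# Subsequent calls: load the vocab and round-trip a test string, logging
# 'Tokenizer working correctly!' if tokenize/detokenize is lossless.
librispeech_tokenizer.run(train=False, data_dir=data_dir)
```
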