From 209f1ddabf38a0bef563e02dc73ca375a951a82b Mon Sep 17 00:00:00 2001 From: Sourabh Medapati Date: Thu, 27 Jul 2023 23:57:33 +0000 Subject: [PATCH 1/5] librispeech dataset script fixes --- datasets/dataset_setup.py | 46 +++++++++++++++++++----------- datasets/librispeech_preprocess.py | 17 ----------- datasets/librispeech_tokenizer.py | 22 +++----------- 3 files changed, 33 insertions(+), 52 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index bc4502a24..d4c9af73c 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -169,9 +169,10 @@ 'The number of threads to use in parallel when decompressing.') flags.DEFINE_string('framework', None, 'Can be either jax or pytorch.') -flags.DEFINE_boolean('train_tokenizer', True, 'Train Librispeech tokenizer.') FLAGS = flags.FLAGS +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" +tf.config.set_visible_devices([], 'GPU') def _maybe_mkdir(d): if not os.path.exists(d): @@ -458,17 +459,26 @@ def download_imagenet_v2(data_dir): data_dir=data_dir).download_and_prepare() -def download_librispeech(dataset_dir, tmp_dir, train_tokenizer): +def download_librispeech(dataset_dir, tmp_dir): # After extraction the result is a folder named Librispeech containing audio # files in .flac format along with transcripts containing name of audio file # and corresponding transcription. - tmp_librispeech_dir = os.path.join(tmp_dir, 'LibriSpeech') + tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech') + extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech') + final_data_dir = os.path.join(dataset_dir, 'librispeech_processed') + _maybe_mkdir(tmp_librispeech_dir) for split in ['dev', 'test']: for version in ['clean', 'other']: - wget_cmd = f'wget http://www.openslr.org/resources/12/{split}-{version}.tar.gz -O - | tar xz' # pylint: disable=line-too-long - subprocess.Popen(wget_cmd, shell=True, cwd=tmp_dir).communicate() + wget_cmd = ( + f'wget --directory-prefix={tmp_librispeech_dir} ' + f'http://www.openslr.org/resources/12/{split}-{version}.tar.gz') + subprocess.Popen(wget_cmd, shell=True).communicate() + tar_path = os.path.join(tmp_librispeech_dir, f'{split}-{version}.tar.gz') + subprocess.Popen( + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() + tars = [ 'raw-metadata.tar.gz', @@ -477,19 +487,21 @@ def download_librispeech(dataset_dir, tmp_dir, train_tokenizer): 'train-other-500.tar.gz', ] for tar_filename in tars: - wget_cmd = f'wget http://www.openslr.org/resources/12/{tar_filename} -O - | tar xz ' # pylint: disable=line-too-long - subprocess.Popen(wget_cmd, shell=True, cwd=tmp_dir).communicate() + wget_cmd = (f'wget --directory-prefix={tmp_librispeech_dir} ' + f'http://www.openslr.org/resources/12/{tar_filename}') + subprocess.Popen(wget_cmd, shell=True).communicate() + tar_path = os.path.join(tmp_librispeech_dir, tar_filename) + subprocess.Popen(f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() + + tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab') - if train_tokenizer: - tokenizer_vocab_path = librispeech_tokenizer.run( - train=True, data_dir=tmp_librispeech_dir) + if not os.path.exists(tokenizer_vocab_path): + librispeech_tokenizer.run(train=True, data_dir=extracted_data_dir) - # Preprocess data. 
-  librispeech_dir = os.path.join(dataset_dir, 'librispeech')
-  librispeech_preprocess.run(
-      input_dir=tmp_librispeech_dir,
-      output_dir=librispeech_dir,
-      tokenizer_vocab_path=tokenizer_vocab_path)
+  librispeech_preprocess.run(
+      input_dir=extracted_data_dir,
+      output_dir=final_data_dir,
+      tokenizer_vocab_path=tokenizer_vocab_path)
 
 
 def download_mnist(data_dir):
@@ -577,7 +589,7 @@ def main(_):
 
   if FLAGS.all or FLAGS.librispeech:
     logging.info('Downloading Librispeech...')
-    download_librispeech(data_dir, tmp_dir, train_tokenizer=True)
+    download_librispeech(data_dir, tmp_dir)
 
   if FLAGS.all or FLAGS.cifar:
     logging.info('Downloading CIFAR...')
diff --git a/datasets/librispeech_preprocess.py b/datasets/librispeech_preprocess.py
index 2ce8d79ca..b938c8428 100644
--- a/datasets/librispeech_preprocess.py
+++ b/datasets/librispeech_preprocess.py
@@ -9,7 +9,6 @@
 import threading
 import time
 
-from absl import flags
 from absl import logging
 import numpy as np
 import pandas as pd
@@ -23,14 +22,6 @@
 exists = tf.io.gfile.exists
 rename = tf.io.gfile.rename
 
-flags.DEFINE_string('raw_input_dir',
-                    '',
-                    'Path to the raw training data directory.')
-flags.DEFINE_string('output_dir', '', 'Dir to write the processed data to.')
-flags.DEFINE_string('tokenizer_vocab_path',
-                    '',
-                    'Path to sentence piece tokenizer vocab file.')
-FLAGS = flags.FLAGS
 
 TRANSCRIPTION_MAX_LENGTH = 256
 AUDIO_MAX_LENGTH = 320000
@@ -178,11 +169,3 @@ def run(input_dir, output_dir, tokenizer_vocab_path):
           'expected count: {} vs expected {}'.format(
               num_entries, librispeech_example_counts[subset]))
     example_ids.to_csv(os.path.join(output_dir, f'{subset}.csv'))
-
-
-def main():
-  run(FLAGS.input_dir, FLAGS.output_dir, FLAGS.tokenizer_vocab_path)
-
-
-if __name__ == '__main__':
-  main()
diff --git a/datasets/librispeech_tokenizer.py b/datasets/librispeech_tokenizer.py
index 71aa719c2..e701d59d4 100644
--- a/datasets/librispeech_tokenizer.py
+++ b/datasets/librispeech_tokenizer.py
@@ -8,7 +8,6 @@
 import tempfile
 from typing import Dict
 
-from absl import flags
 from absl import logging
 import sentencepiece as spm
 import tensorflow as tf
@@ -21,13 +20,6 @@
 
 Features = Dict[str, tf.Tensor]
 
-flags.DEFINE_string('input_dir', '', 'Path to training data directory.')
-flags.DEFINE_boolean(
-    'train',
-    False,
-    'Whether to train a new tokenizer or load existing one to test.')
-FLAGS = flags.FLAGS
-
 
 def dump_chars_for_training(data_folder, splits, maxchars: int = int(1e7)):
   char_count = 0
@@ -118,13 +110,15 @@ def load_tokenizer(model_filepath):
 
 def run(train, data_dir):
   logging.info('Data dir: %s', data_dir)
+  vocab_path = os.path.join(data_dir, 'spm_model.vocab')
+  logging.info('vocab_path = %s', vocab_path)
 
   if train:
     logging.info('Training...')
     splits = ['train-clean-100']
-    return train_tokenizer(data_dir, splits)
+    train_tokenizer(data_dir, splits, model_path=vocab_path)
   else:
-    tokenizer = load_tokenizer(os.path.join(data_dir, 'spm_model.vocab'))
+    tokenizer = load_tokenizer(vocab_path)
     test_input = 'OPEN SOURCE ROCKS'
     tokens = tokenizer.tokenize(test_input)
     detokenized = tokenizer.detokenize(tokens).numpy().decode('utf-8')
@@ -135,11 +129,3 @@
 
   if detokenized == test_input:
     logging.info('Tokenizer working correctly!')
-
-
-def main():
-  run(FLAGS.train, FLAGS.data_dir)
-
-
-if __name__ == '__main__':
-  main()

From f4315368bbd671085e835fee240202900dd1f7bf Mon Sep 17 00:00:00 2001
From: Sourabh Medapati
Date: Thu, 27 Jul 2023 23:59:56 +0000
Subject: [PATCH 2/5] indentation fix

---
 datasets/dataset_setup.py | 2 +-
 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index d4c9af73c..37e49cd84 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -487,7 +487,7 @@ def download_librispeech(dataset_dir, tmp_dir): 'train-other-500.tar.gz', ] for tar_filename in tars: - wget_cmd = (f'wget --directory-prefix={tmp_librispeech_dir} ' + wget_cmd = (f'wget --directory-prefix={tmp_librispeech_dir} ' f'http://www.openslr.org/resources/12/{tar_filename}') subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, tar_filename) From 2d0f5d46332fd5cb3e99fbb8033e47e618110bea Mon Sep 17 00:00:00 2001 From: Sourabh Medapati Date: Fri, 28 Jul 2023 00:08:47 +0000 Subject: [PATCH 3/5] yapf fixes --- datasets/dataset_setup.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index 37e49cd84..8bfc8e4f2 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -174,6 +174,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "-1" tf.config.set_visible_devices([], 'GPU') + def _maybe_mkdir(d): if not os.path.exists(d): os.makedirs(d) @@ -466,7 +467,7 @@ def download_librispeech(dataset_dir, tmp_dir): tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech') extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech') final_data_dir = os.path.join(dataset_dir, 'librispeech_processed') - + _maybe_mkdir(tmp_librispeech_dir) for split in ['dev', 'test']: @@ -477,7 +478,8 @@ def download_librispeech(dataset_dir, tmp_dir): subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, f'{split}-{version}.tar.gz') subprocess.Popen( - f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', + shell=True).communicate() tars = [ @@ -491,7 +493,9 @@ def download_librispeech(dataset_dir, tmp_dir): f'http://www.openslr.org/resources/12/{tar_filename}') subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, tar_filename) - subprocess.Popen(f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() + subprocess.Popen( + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', + shell=True).communicate() tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab') From 32eab1255458eb290924fc527ddd7487cdfd7bf5 Mon Sep 17 00:00:00 2001 From: Sourabh Medapati Date: Fri, 28 Jul 2023 00:14:49 +0000 Subject: [PATCH 4/5] yapf fixes --- datasets/dataset_setup.py | 5 ++--- datasets/librispeech_preprocess.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index 8bfc8e4f2..2c9675bd3 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -481,7 +481,6 @@ def download_librispeech(dataset_dir, tmp_dir): f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() - tars = [ 'raw-metadata.tar.gz', 'train-clean-100.tar.gz', @@ -494,8 +493,8 @@ def download_librispeech(dataset_dir, tmp_dir): subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, tar_filename) subprocess.Popen( - f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', - shell=True).communicate() + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', + shell=True).communicate() tokenizer_vocab_path = os.path.join(extracted_data_dir, 
'spm_model.vocab') diff --git a/datasets/librispeech_preprocess.py b/datasets/librispeech_preprocess.py index b938c8428..0968f2a00 100644 --- a/datasets/librispeech_preprocess.py +++ b/datasets/librispeech_preprocess.py @@ -22,7 +22,6 @@ exists = tf.io.gfile.exists rename = tf.io.gfile.rename - TRANSCRIPTION_MAX_LENGTH = 256 AUDIO_MAX_LENGTH = 320000 From 1ceb96d237d5be684d1bd1bb1596353ee62b8167 Mon Sep 17 00:00:00 2001 From: Sourabh Medapati Date: Fri, 28 Jul 2023 00:16:35 +0000 Subject: [PATCH 5/5] yapf fixes --- datasets/dataset_setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index 2c9675bd3..5b9f02904 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -478,7 +478,7 @@ def download_librispeech(dataset_dir, tmp_dir): subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, f'{split}-{version}.tar.gz') subprocess.Popen( - f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() tars = [ @@ -493,7 +493,7 @@ def download_librispeech(dataset_dir, tmp_dir): subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, tar_filename) subprocess.Popen( - f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab')
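
Usage sketch (not part of the patch series above): after these commits, the
LibriSpeech download, tokenizer training, and preprocessing are all driven
through a single entry point in datasets/dataset_setup.py. A minimal driver,
assuming the datasets/ directory is importable as a package and using
placeholder paths:

    from datasets import dataset_setup

    # Fetches the dev/test and train tarballs into <tmp_dir>/librispeech,
    # extracts them to <tmp_dir>/librispeech/LibriSpeech, trains the
    # SentencePiece tokenizer only when spm_model.vocab is absent there,
    # and writes processed examples to <dataset_dir>/librispeech_processed.
    dataset_setup.download_librispeech('/data', '/data/tmp')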