This repository has been archived by the owner on Mar 10, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathdemo.py
142 lines (105 loc) · 4.77 KB
/
demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import logging
import sys
import math
import tensorflow as tf
import utils
# Root logger setup: DEBUG-level records, timestamped, written to stdout.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Checkpoint location of the trained model.
MODEL_PATH = "./models/model.ckpt"

# Directory for TensorFlow summaries.
SUMMARY_PATH = "./logs/"

# LibriSpeech data layout (train / test / dev splits live under DATA_DIR).
DATA_DIR = "./data/LibriSpeech/"
TRAIN_DIR = DATA_DIR + "train-clean-100-wav/"
TEST_DIR = DATA_DIR + "test-clean-wav/"
DEV_DIR = DATA_DIR + "dev-clean-wav/"

# Label encoding: index 0 is reserved for the space token, so letters are
# offset by ord('a') - 1.
SPACE_TOKEN = '<space>'
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1

# Number of features per input frame.
NUM_FEATURES = 13

# 26 letters ('a'..'z') + space + blank label = 28 classes.
NUM_CLASSES = (ord('z') - ord('a') + 1) + 1 + 1

# Hyper-parameters.
NUM_EPOCHS = 200
NUM_HIDDEN = 50
NUM_LAYERS = 1
BATCH_SIZE = 1

# Optimizer parameters.
INITIAL_LEARNING_RATE = 1e-2
MOMENTUM = 0.9
def main(argv):
    """Restore the trained CTC model and log its transcriptions of the test set.

    Reads the reference transcripts and the audio from ``TEST_DIR``, rebuilds
    the inference graph (LSTM stack followed by an affine projection and a
    greedy CTC decoder), restores the weights saved at ``MODEL_PATH`` and logs
    every decoded text next to its reference transcript.

    Args:
        argv: Command-line arguments forwarded by ``tf.app.run``; unused.
    """
    # Reference transcripts and audio must come from the SAME split, otherwise
    # the "Original"/"Decoded" pairs logged below describe different utterances.
    test_texts = utils.read_text_files(TEST_DIR)

    test_inputs = utils.read_audio_files(TEST_DIR)
    test_inputs = utils.standardize_audios(test_inputs)
    test_sequence_lengths = utils.get_sequence_lengths(test_inputs)
    test_inputs = utils.make_sequences_same_length(test_inputs, test_sequence_lengths)

    config = tf.ConfigProto()
    graph = tf.Graph()

    # The device scope has to be entered while `graph` is the default graph:
    # tf.device() only affects ops created in the graph that is current when
    # it is called.
    with graph.as_default(), tf.device('/cpu:0'):
        logging.debug("Starting new TensorFlow graph.")

        inputs_placeholder = tf.placeholder(tf.float32, [None, None, NUM_FEATURES])

        # 1d array of size [batch_size].
        sequence_length_placeholder = tf.placeholder(tf.int32, [None])

        # One distinct cell object per layer; reusing a single instance for
        # several layers makes them share (or clash on) the same variables.
        cells = [tf.contrib.rnn.LSTMCell(NUM_HIDDEN, state_is_tuple=True)
                 for _ in range(NUM_LAYERS)]
        stack = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

        # Creates a recurrent neural network.
        outputs, _ = tf.nn.dynamic_rnn(stack,
                                       inputs_placeholder,
                                       sequence_length_placeholder,
                                       dtype=tf.float32)

        batch_size = tf.shape(inputs_placeholder)[0]

        # Reshaping to apply the same weights over the time steps.
        outputs = tf.reshape(outputs, [-1, NUM_HIDDEN])

        weights = tf.Variable(tf.truncated_normal([NUM_HIDDEN, NUM_CLASSES], stddev=0.1),
                              name='weights')
        bias = tf.Variable(tf.constant(0., shape=[NUM_CLASSES]),
                           name='bias')

        # Doing the affine projection.
        logits = tf.matmul(outputs, weights) + bias

        # Reshaping back to the original shape.
        logits = tf.reshape(logits, [batch_size, -1, NUM_CLASSES])

        # Time is major.
        logits = tf.transpose(logits, (1, 0, 2))

        # CTC decoder; the dense conversion is part of the graph so that a
        # single session.run() yields the padded label matrix (-1 = padding).
        decoded, _ = tf.nn.ctc_greedy_decoder(logits, sequence_length_placeholder)
        dense_decoded_op = tf.sparse_tensor_to_dense(decoded[0], default_value=-1)

        # Saver op to save and restore all the variables.
        saver = tf.train.Saver()

    with tf.Session(config=config, graph=graph) as session:
        logging.debug("Starting TensorFlow session.")

        # Restoring assigns every variable from the checkpoint, so no separate
        # initializer run is needed.
        saver.restore(session, MODEL_PATH)

        test_feed = {inputs_placeholder: test_inputs,
                     sequence_length_placeholder: test_sequence_lengths}

        # Decoding.
        dense_decoded = session.run(dense_decoded_op, feed_dict=test_feed)

    test_num = len(test_texts)
    if len(dense_decoded) != test_num:
        logging.warning("Decoded %d sequences but found %d reference texts.",
                        len(dense_decoded), test_num)

    for i, (original_text, sequence) in enumerate(zip(test_texts, dense_decoded)):
        # Drop the -1 padding introduced by the sparse-to-dense conversion.
        sequence = [s for s in sequence if s != -1]
        decoded_text = utils.sequence_decoder(sequence)

        logging.info("Sequence %d/%d", i + 1, test_num)
        logging.info("Original:\n%s", original_text)
        logging.info("Decoded:\n%s", decoded_text)
# Script entry point: let TensorFlow parse flags and then invoke main().
if __name__ == '__main__':
    tf.app.run(main=main)