-
Notifications
You must be signed in to change notification settings - Fork 0
/
synthesize.py
190 lines (172 loc) · 6.48 KB
/
synthesize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# pylint: disable=redefined-outer-name, unused-argument
import os
import time
import argparse
import torch
import json
import string
from TTS.utils.synthesis import synthesis
from TTS.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.utils.text.symbols import make_symbols, symbols, phonemes
from TTS.utils.audio import AudioProcessor
# Audio display (did not work, left disabled):
#import numpy as np
#from IPython.display import Audio
#from scipy.io import wavfile
def tts(model,
        vocoder_model,
        C,
        VC,
        text,
        ap,
        ap_vocoder,
        use_cuda,
        batched_vocoder,
        speaker_id=None,
        figures=False):
    """Run the TTS model on ``text`` and optionally pass the result to a vocoder.

    Args:
        model: TTS model handed to ``synthesis``.
        vocoder_model: Vocoder exposing ``generate``; ``None`` means the
            ``synthesis`` call is asked to use Griffin-Lim instead.
        C: TTS config; ``C.model`` and ``C.enable_eos_bos_chars`` are read.
        VC: Vocoder config. Accepted but not used in this function.
        text: Input text to synthesize.
        ap: Audio processor matching the TTS model.
        ap_vocoder: Audio processor matching the vocoder (only used when a
            vocoder is given).
        use_cuda: When truthy, the vocoder input is moved to CUDA.
        batched_vocoder: Forwarded as ``batched`` to ``vocoder_model.generate``.
        speaker_id: Optional speaker id forwarded to ``synthesis``.
        figures: Accepted but not used in this function.

    Returns:
        Tuple ``(alignment, postnet_output, stop_tokens, waveform)``.
    """
    started_at = time.time()
    has_vocoder = vocoder_model is not None

    outputs = synthesis(
        model,
        text,
        C,
        use_cuda,
        ap,
        speaker_id,
        style_wav=False,
        truncated=False,
        enable_eos_bos_chars=C.enable_eos_bos_chars,
        use_griffin_lim=not has_vocoder,
        do_trim_silence=True)
    waveform, alignment, _, postnet_output, stop_tokens, _ = outputs

    # Tacotron output is converted from linear to mel before vocoding.
    if C.model == "Tacotron" and has_vocoder:
        postnet_output = ap.out_linear_to_mel(postnet_output.T).T

    if has_vocoder:
        # Re-scale from the TTS model's normalization to the vocoder's, in
        # case the two audio processors were configured differently.
        postnet_output = ap_vocoder._normalize(ap._denormalize(postnet_output))
        vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
        if use_cuda:
            vocoder_input = vocoder_input.cuda()
        waveform = vocoder_model.generate(vocoder_input,
                                          batched=batched_vocoder,
                                          target=8000,
                                          overlap=400)

    elapsed = time.time() - started_at
    print(" > Run-time: {}".format(elapsed))
    return alignment, postnet_output, stop_tokens, waveform
def str2bool(value):
    """Convert a command-line token into a bool (for argparse ``type=``).

    ``type=bool`` would call ``bool("False")``, which is ``True`` because any
    non-empty string is truthy. This converter interprets the text instead.

    Args:
        value: A string such as ``"true"``/``"False"``/``"1"``/``"no"``, or an
            actual ``bool`` (returned unchanged).

    Returns:
        The corresponding boolean. The empty string maps to ``False``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognized boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "Boolean value expected, got {!r}.".format(value))


if __name__ == "__main__":
    # Command-line entry point: parse arguments, load the TTS model (and an
    # optional WaveRNN vocoder), synthesize the text and write a wav file.
    # `symbols` and `phonemes` are already module-level names here, so no
    # `global` declaration is needed before rebinding them below.
    parser = argparse.ArgumentParser()
    parser.add_argument('text', type=str, help='Text to generate speech.')
    # NOTE: argparse ignores `default` on required positionals, so the two
    # defaults below never take effect; they are kept to leave the CLI as is.
    parser.add_argument('config_path',
                        type=str,
                        help='Path to model config file.',
                        default="./pretrained/config.json")
    parser.add_argument(
        'model_path',
        type=str,
        help='Path to model file.',
        default="./pretrained/best_model.pth.tar"
    )
    parser.add_argument(
        'out_path',
        type=str,
        help='Path to save final wav file.',
    )
    parser.add_argument('--use_cuda',
                        type=str2bool,
                        help='Run model on CUDA.',
                        default=True)
    parser.add_argument(
        '--vocoder_path',
        type=str,
        help=
        'Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).',
        default="",
    )
    parser.add_argument('--vocoder_config_path',
                        type=str,
                        help='Path to vocoder model config file.',
                        default="")
    parser.add_argument(
        '--batched_vocoder',
        type=str2bool,
        help="If True, vocoder model uses faster batch processing.",
        default=True)
    parser.add_argument('--speakers_json',
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default="")
    parser.add_argument(
        '--speaker_id',
        type=int,
        help="target speaker_id if the model is multi-speaker.",
        default=None)
    args = parser.parse_args()

    use_vocoder = args.vocoder_path != ""
    if use_vocoder:
        # Validate explicitly instead of with `assert`, which is stripped
        # when Python runs with -O.
        if not args.use_cuda:
            parser.error(" [!] Enable cuda for vocoder.")
        # Imported lazily so WaveRNN is only required when a vocoder is used.
        from WaveRNN.models.wavernn import Model as VocoderModel

    # load the config
    C = load_config(args.config_path)
    C.forward_attn_mask = True

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # load speakers
    if args.speakers_json != '':
        # Context manager closes the file handle deterministically.
        with open(args.speakers_json, 'r') as speakers_file:
            speakers = json.load(speakers_file)
        num_speakers = len(speakers)
    else:
        num_speakers = 0

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)
    # Without CUDA, remap tensors to CPU so a checkpoint saved on a GPU can
    # still be loaded; with CUDA the default placement is kept.
    map_location = None if args.use_cuda else torch.device('cpu')
    cp = torch.load(args.model_path, map_location=map_location)
    model.load_state_dict(cp['model'])
    model.eval()
    if args.use_cuda:
        model.cuda()
    model.decoder.set_r(cp['r'])

    # load vocoder model
    if use_vocoder:
        VC = load_config(args.vocoder_config_path)
        ap_vocoder = AudioProcessor(**VC.audio)
        vocoder_model = VocoderModel(rnn_dims=512,
                                     fc_dims=512,
                                     mode=VC.mode,
                                     mulaw=VC.mulaw,
                                     pad=VC.pad,
                                     upsample_factors=VC.upsample_factors,
                                     feat_dims=VC.audio["num_mels"],
                                     compute_dims=128,
                                     res_out_dims=128,
                                     res_blocks=10,
                                     hop_length=ap.hop_length,
                                     sample_rate=ap.sample_rate,
                                     use_aux_net=True,
                                     use_upsample_net=True)
        check = torch.load(args.vocoder_path)
        vocoder_model.load_state_dict(check['model'])
        vocoder_model.eval()
        if args.use_cuda:
            vocoder_model.cuda()
    else:
        vocoder_model = None
        VC = None
        ap_vocoder = None

    # synthesize voice
    print(" > Text: {}".format(args.text))
    _, _, _, wav = tts(model,
                       vocoder_model,
                       C,
                       VC,
                       args.text,
                       ap,
                       ap_vocoder,
                       args.use_cuda,
                       args.batched_vocoder,
                       speaker_id=args.speaker_id,
                       figures=False)

    # save the results; `out_path` is used verbatim as the output file path
    out_path = args.out_path
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)