diff --git a/test/test_streaming.py b/test/test_streaming.py index f8bbae2..3e35409 100644 --- a/test/test_streaming.py +++ b/test/test_streaming.py @@ -38,12 +38,14 @@ def stream_ffplay(audio_stream, output_file, save=True): ffplay_proc.wait() -def tts(text, speaker) -> Iterator[bytes]: +def tts(text, speaker,language, server_url , decoder, stream_chunk_size) -> Iterator[bytes]: start = time.perf_counter() speaker["text"] = text - speaker["language"] = "en" + speaker["language"] = language + speaker["decoder"] = decoder # "hifigan" or "ne_hifigan" for TTS>0.19.0 + speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality res = requests.post( - "http://localhost:8000/tts_stream", + f"{server_url}/tts_stream", json=speaker, stream=True, ) @@ -66,9 +68,9 @@ def tts(text, speaker) -> Iterator[bytes]: print("⏱️ response.elapsed:", res.elapsed) -def get_speaker(ref_audio): +def get_speaker(ref_audio,server_url): files = {"wav_file": ("reference.wav", open(ref_audio, "rb"))} - response = requests.post("http://localhost:8000/clone_speaker", files=files) + response = requests.post(f"{server_url}/clone_speaker", files=files) return response.json() @@ -77,15 +79,41 @@ def get_speaker(ref_audio): parser.add_argument( "--text", default="It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + help="text input for TTS" ) + parser.add_argument( + "--language", + default="en", + help="Language to use default is 'en' (English)" + ) + parser.add_argument( "--output_file", default=None, + help="Save TTS output to given filename" ) parser.add_argument( "--ref_file", default=None, + help="Reference audio file to use, when not given will use default" ) + parser.add_argument( + "--server_url", + default="http://localhost:8000", + help="Server url http://localhost:8000 default, change to your server location " + ) + parser.add_argument( + "--decoder", + default="ne_hifigan", + help="Decoder for vocoder, ne_hifigan default, options ne_hifigan or hifigan" + ) + + parser.add_argument( + "--stream_chunk_size", + default="20", + help="Stream chunk size , 20 default, reducing will get faster latency but may degrade quality" + ) + args = parser.parse_args() with open("./default_speaker.json", "r") as file: @@ -93,6 +121,6 @@ def get_speaker(ref_audio): if args.ref_file is not None: print("Computing the latents for a new reference...") - speaker = get_speaker(args.ref_file) + speaker = get_speaker(args.ref_file,args.server_url) - audio = stream_ffplay(tts(args.text, speaker), args.output_file, save=bool(args.output_file)) + audio = stream_ffplay(tts(args.text, speaker,args.language,args.server_url,args.decoder,args.stream_chunk_size), args.output_file, save=bool(args.output_file))