From 9166d8b33f64b4ad34b4bbfd710067a31eecc920 Mon Sep 17 00:00:00 2001 From: Ajay Raj Date: Thu, 3 Aug 2023 16:16:10 -0700 Subject: [PATCH] only tune silence for larger utterances (#336) --- vocode/streaming/synthesizer/azure_synthesizer.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vocode/streaming/synthesizer/azure_synthesizer.py b/vocode/streaming/synthesizer/azure_synthesizer.py index dd7f3e711..7b91f2f46 100644 --- a/vocode/streaming/synthesizer/azure_synthesizer.py +++ b/vocode/streaming/synthesizer/azure_synthesizer.py @@ -184,11 +184,16 @@ def create_ssml( "styledegree", str(bot_sentiment.degree * 2) ) # Azure specific, it's a scale of 0-2 voice_root = styled - silence = ElementTree.SubElement( - voice_root, "{%s}silence" % NAMESPACES.get("mstts") - ) - silence.set("value", "500ms") - silence.set("type", "Tailing-exact") + # this ugly hack is necessary so we can limit the gap between sentences + # for normal sentences, it seems like the gap is > 500ms, so we're able to reduce it to 500ms + # for very tiny sentences, the API hangs - so we heuristically only update the silence gap + # if there is more than one word in the sentence + if " " in message: + silence = ElementTree.SubElement( + voice_root, "{%s}silence" % NAMESPACES.get("mstts") + ) + silence.set("value", "500ms") + silence.set("type", "Tailing-exact") prosody = ElementTree.SubElement(voice_root, "prosody") prosody.set("pitch", f"{self.pitch}%") prosody.set("rate", f"{self.rate}%")