From 0275c1b9dc85def8337e263b8512d519e9686768 Mon Sep 17 00:00:00 2001
From: Yulin Li
Date: Tue, 4 Jul 2023 20:01:13 +0800
Subject: [PATCH 1/2] add a sample to show how to synthesize the audio with
 auto style prediction

---
 samples/python/console/README.md              |  4 +
 .../speech_synthesis_auto_style_sample.py     | 96 +++++++++++++++++++
 2 files changed, 100 insertions(+)
 create mode 100644 samples/python/console/speech_synthesis_auto_style_sample.py

diff --git a/samples/python/console/README.md b/samples/python/console/README.md
index 788b51745..fca0a6936 100644
--- a/samples/python/console/README.md
+++ b/samples/python/console/README.md
@@ -38,6 +38,10 @@ This sample demonstrates various forms of speech recognition, intent recognition
   ```sh
   pip install scipy
   ```
+* For speech synthesis with auto style sample, install the openai package in your Python interpreter by executing the command
+  ```sh
+  pip install openai
+  ```
 * [Download the sample code to your development PC.](/README.md#get-the-samples)
 * To tailor the sample to your configuration, use search and replace across the whole sample directory to update the following strings:
 
diff --git a/samples/python/console/speech_synthesis_auto_style_sample.py b/samples/python/console/speech_synthesis_auto_style_sample.py
new file mode 100644
index 000000000..6a2627672
--- /dev/null
+++ b/samples/python/console/speech_synthesis_auto_style_sample.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# Copyright (c) Microsoft. All rights reserved.
+# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+"""
+Speech synthesis sample with style auto prediction from GPT
+"""
+
+try:
+    import azure.cognitiveservices.speech as speechsdk
+except ImportError:
+    print("""
+    Importing the Speech SDK for Python failed.
+    Refer to
+    https://docs.microsoft.com/azure/cognitive-services/speech-service/quickstart-text-to-speech-python for
+    installation instructions.
+    """)
+    import sys
+    sys.exit(1)
+
+import json
+import os
+import openai
+openai.api_type = "azure"
+openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
+openai.api_version = "2023-05-15"
+openai.api_key = os.getenv("AZURE_OPENAI_KEY")
+
+STYLE_PROMPT = """You are requested to split the following input into sentences and tag a correct style from {0} and `general`.
+Please response with json format, e.g., [{{"sentence": text, "style": style }}]
+Input: {1}"""
+
+SPEECH_REGION = os.getenv("SPEECH_REGION")
+SPEECH_KEY = os.getenv("SPEECH_KEY")
+
+# Set the voice you want to use
+VOICE = "en-US-JennyNeural"
+
+# paste or read the text you want to synthesize
+TEXT = """Elizabeth Bennet had been obliged, by the scarcity of gentlemen, to sit down for two dances; and during part of that time, Mr. Darcy had been standing near enough for her to overhear a conversation between him and Mr. Bingley, who came from the dance for a few minutes to press his friend to join it.
+
+“Come, Darcy,” said he, “I must have you dance. I hate to see you standing about by yourself in this stupid manner. You had much better dance.”
+
+“I certainly shall not. You know how I detest it, unless I am particularly acquainted with my partner. At such an assembly as this, it would be insupportable. Your sisters are engaged, and there is not another woman in the room whom it would not be a punishment to me to stand up with.”
+
+“I would not be so fastidious as you are,” cried Bingley, “for a kingdom! Upon my honour, I never met with so many pleasant girls in my life as I have this evening; and there are several of them, you see, uncommonly pretty.”
+
+“You are dancing with the only handsome girl in the room,” said Mr. Darcy, looking at the eldest Miss Bennet.
+
+“Oh, she is the most beautiful creature I ever beheld! But there is one of her sisters sitting down just behind you, who is very pretty, and I dare say very agreeable. Do let me ask my partner to introduce you.”"""
+
+
+if __name__ == "__main__":
+    speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION)
+    speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio24Khz96KBitRateMonoMp3)
+    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
+    voices = speech_synthesizer.get_voices_async().get().voices
+    for v in voices:
+        if VOICE == v.name or VOICE == v.short_name:
+            styles = v.style_list
+            break
+    else:
+        print(f"The voice {VOICE} is not found")
+        exit(1)
+
+    if not styles:
+        print(f"The voice {VOICE} does not support styles")
+        exit(1)
+
+    # refer to AOAI document: https://learn.microsoft.com/azure/cognitive-services/openai/chatgpt-quickstart?tabs=command-line&pivots=programming-language-python
+    aoi_response = openai.ChatCompletion.create(
+        engine="gpt-35-turbo",  # engine = "deployment_name".
+        messages=[
+            {"role": "system", "content": "You are an assistant."},
+            {"role": "user", "content": STYLE_PROMPT.format(styles, TEXT)},
+        ]
+    )
+
+    prediction = aoi_response['choices'][0]['message']['content']
+    print("Style prediction: {}".format(prediction))
+    prediction = json.loads(prediction)
+    # style elements in SSML: https://learn.microsoft.com/azure/cognitive-services/speech-service/speech-synthesis-markup-voice#style-and-degree-example
+    ssml = "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='{}'>{}</voice></speak>"
+    style_elements = ""
+    for s in prediction:
+        style_elements += "<mstts:express-as style='{}'>{}</mstts:express-as>".format(s["style"], s["sentence"])
+
+    ssml = ssml.format(VOICE, style_elements)
+    result = speech_synthesizer.speak_ssml_async(ssml).get()
+    if result.reason == speechsdk.ResultReason.Canceled:
+        print("Speech synthesis canceled: {}".format(result.cancellation_details.error_details))
+        exit(1)
+    audio_data_stream = speechsdk.AudioDataStream(result)
+    # save the audio file
+    audio_data_stream.save_to_wav_file("output.mp3")
From ba51ec9bc709c6a960c67b934c303f81bafe3748 Mon Sep 17 00:00:00 2001
From: Yulin Li
Date: Sun, 9 Jul 2023 17:36:37 +0800
Subject: [PATCH 2/2] address comments

---
 samples/python/console/speech_synthesis_auto_style_sample.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/samples/python/console/speech_synthesis_auto_style_sample.py b/samples/python/console/speech_synthesis_auto_style_sample.py
index 6a2627672..d20a4089e 100644
--- a/samples/python/console/speech_synthesis_auto_style_sample.py
+++ b/samples/python/console/speech_synthesis_auto_style_sample.py
@@ -37,7 +37,8 @@
 # Set the voice you want to use
 VOICE = "en-US-JennyNeural"
 
-# paste or read the text you want to synthesize
+# Paste or read the text you want to synthesize
+# The following demo text is from `The Project Gutenberg eBook of Pride and prejudice` (https://www.gutenberg.org/files/1342/1342-h/1342-h.htm)
 TEXT = """Elizabeth Bennet had been obliged, by the scarcity of gentlemen, to sit down for two dances; and during part of that time, Mr. Darcy had been standing near enough for her to overhear a conversation between him and Mr. Bingley, who came from the dance for a few minutes to press his friend to join it.
 
 “Come, Darcy,” said he, “I must have you dance. I hate to see you standing about by yourself in this stupid manner. You had much better dance.”