From 5a8636861b0b93af95c373b5093c27b12e84c1ec Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 25 Dec 2024 18:01:49 +0900 Subject: [PATCH 1/9] [bugfix] ValueError: Unknown field for RecognitionConfig --- speech_recognition/recognizers/google_cloud.py | 2 +- tests/recognizers/test_google_cloud.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index 23c1b4e5..ed1e93f3 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -79,7 +79,7 @@ def recognize( **api_params, } if preferred_phrases is not None: - config["speechContexts"] = [ + config["speech_contexts"] = [ speech.SpeechContext(phrases=preferred_phrases) ] if show_all: diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py index c450d14f..1f0c74f3 100644 --- a/tests/recognizers/test_google_cloud.py +++ b/tests/recognizers/test_google_cloud.py @@ -4,6 +4,7 @@ RecognitionAudio, RecognitionConfig, RecognizeResponse, + SpeechContext, SpeechRecognitionAlternative, SpeechRecognitionResult, WordInfo, @@ -165,6 +166,7 @@ def test_transcribe_with_specified_api_parameters(SpeechClient): MagicMock(spec=Recognizer), audio_data, language="ja-JP", + preferred_phrases=["numero", "hoge"], use_enhanced=True, ) @@ -173,6 +175,7 @@ def test_transcribe_with_specified_api_parameters(SpeechClient): encoding=RecognitionConfig.AudioEncoding.FLAC, sample_rate_hertz=16_000, language_code="ja-JP", + speech_contexts=[SpeechContext(phrases=["numero", "hoge"])], use_enhanced=True, ), audio=RecognitionAudio(content=b"flac_data"), From b9fe3316d1bde8de61958d23ec89008b8b2787e2 Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 25 Dec 2024 18:04:50 +0900 Subject: [PATCH 2/9] [bugfix] DO NOT use assert as data validation --- speech_recognition/recognizers/google_cloud.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index ed1e93f3..13cfa977 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -11,7 +11,7 @@ def recognize( audio_data: AudioData, credentials_json_path: str | None = None, language: str = "en-US", - preferred_phrases=None, + preferred_phrases: list[str] | None = None, show_all: bool = False, **api_params, ): @@ -38,15 +38,6 @@ def recognize( Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. """ - assert isinstance( - audio_data, AudioData - ), "``audio_data`` must be audio data" - assert isinstance(language, str), "``language`` must be a string" - assert preferred_phrases is None or all( - isinstance(preferred_phrases, (type(""), type(""))) - for preferred_phrases in preferred_phrases - ), "``preferred_phrases`` must be a list of strings" - try: from google.api_core.exceptions import GoogleAPICallError from google.cloud import speech From 6689acc6a7c9d4fd5ba22d6ac29025d59e72bf5e Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 25 Dec 2024 18:07:06 +0900 Subject: [PATCH 3/9] [refactor] Rename same as API parameter --- speech_recognition/recognizers/google_cloud.py | 6 +++--- tests/recognizers/test_google_cloud.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index 13cfa977..18816121 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -10,7 +10,7 @@ def recognize( recognizer, audio_data: AudioData, credentials_json_path: str | None = None, - language: str = "en-US", + language_code: str = "en-US", preferred_phrases: list[str] | None = None, show_all: bool = False, **api_params, @@ -19,7 +19,7 @@ def recognize( This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. - The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation `__. + The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation `__. If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. @@ -66,7 +66,7 @@ def recognize( config = { "encoding": speech.RecognitionConfig.AudioEncoding.FLAC, "sample_rate_hertz": audio_data.sample_rate, - "language_code": language, + "language_code": language_code, **api_params, } if preferred_phrases is not None: diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py index 1f0c74f3..cca80aec 100644 --- a/tests/recognizers/test_google_cloud.py +++ b/tests/recognizers/test_google_cloud.py @@ -165,7 +165,7 @@ def test_transcribe_with_specified_api_parameters(SpeechClient): _ = recognize( MagicMock(spec=Recognizer), audio_data, - language="ja-JP", + language_code="ja-JP", preferred_phrases=["numero", "hoge"], use_enhanced=True, ) From ce9f12bb5f46949541e48d4732ec13e32c0d6dfe Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 25 Dec 2024 18:35:05 +0900 Subject: [PATCH 4/9] [refactor] Extract logic to build config --- .../recognizers/google_cloud.py | 73 ++++++++++++++----- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index 18816121..21423ae1 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -1,19 +1,66 @@ from __future__ import annotations +from typing import TYPE_CHECKING, TypedDict from urllib.error import URLError from speech_recognition.audio import AudioData from speech_recognition.exceptions import RequestError, UnknownValueError +if TYPE_CHECKING: + from google.cloud.speech_v1.types import RecognitionConfig, SpeechContext + from typing_extensions import Required + + +class GoogleCloudRecognizerParameters(TypedDict, total=False): + # SpeechRecognition specific parameters + preferred_phrases: list[str] + show_all: bool + + # Speech-to-Text V1 API's parameters + language_code: str + use_enhanced: bool + # TODO Add others support + + +class GoogleCloudSpeechV1Parameters(TypedDict, total=False): + """Speech-to-Text V1 API's parameters. + + https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig + """ + + encoding: Required[RecognitionConfig.AudioEncoding] + sample_rate_hertz: Required[int] + language_code: Required[str] + speech_contexts: list[SpeechContext] + enable_word_time_offsets: bool + use_enhanced: bool + + +def _build_config( + audio_data: AudioData, recognizer_params: GoogleCloudRecognizerParameters +) -> RecognitionConfig: + from google.cloud import speech + + parameters: GoogleCloudSpeechV1Parameters = { + "encoding": speech.RecognitionConfig.AudioEncoding.FLAC, + "sample_rate_hertz": audio_data.sample_rate, + "language_code": recognizer_params.pop("language_code", "en-US"), + } + if preferred_phrases := recognizer_params.pop("preferred_phrases", None): + parameters["speech_contexts"] = [ + speech.SpeechContext(phrases=preferred_phrases) + ] + if recognizer_params.pop("show_all", False): + # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets + parameters["enable_word_time_offsets"] = True + return speech.RecognitionConfig(**(parameters | recognizer_params)) + def recognize( recognizer, audio_data: AudioData, credentials_json_path: str | None = None, - language_code: str = "en-US", - preferred_phrases: list[str] | None = None, - show_all: bool = False, - **api_params, + **kwargs: GoogleCloudRecognizerParameters, ): """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API. @@ -63,21 +110,7 @@ def recognize( ) audio = speech.RecognitionAudio(content=flac_data) - config = { - "encoding": speech.RecognitionConfig.AudioEncoding.FLAC, - "sample_rate_hertz": audio_data.sample_rate, - "language_code": language_code, - **api_params, - } - if preferred_phrases is not None: - config["speech_contexts"] = [ - speech.SpeechContext(phrases=preferred_phrases) - ] - if show_all: - # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets - config["enable_word_time_offsets"] = True - - config = speech.RecognitionConfig(**config) + config = _build_config(audio_data, kwargs.copy()) try: response = client.recognize(config=config, audio=audio) @@ -88,7 +121,7 @@ def recognize( "recognition connection failed: {0}".format(e.reason) ) - if show_all: + if kwargs.get("show_all"): return response if len(response.results) == 0: raise UnknownValueError() From d6afb5367ca6a0833fc571366d905afc05d4d5f3 Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 25 Dec 2024 19:03:23 +0900 Subject: [PATCH 5/9] [feat] Migrate to Google ADC (not paste credential JSON) --- examples/audio_transcribe.py | 4 ++-- examples/extended_results.py | 4 ++-- examples/microphone_recognition.py | 4 ++-- examples/special_recognizer_features.py | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py index 7806023f..ec14614d 100644 --- a/examples/audio_transcribe.py +++ b/examples/audio_transcribe.py @@ -33,9 +33,9 @@ print("Could not request results from Google Speech Recognition service; {0}".format(e)) # recognize speech using Google Cloud Speech -GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE""" +# Before run, create local authentication credentials (``gcloud auth application-default login``) try: - print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS)) + print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio)) except sr.UnknownValueError: print("Google Cloud Speech could not understand audio") except sr.RequestError as e: diff --git a/examples/extended_results.py b/examples/extended_results.py index 599c67f2..f65061ed 100644 --- a/examples/extended_results.py +++ b/examples/extended_results.py @@ -37,10 +37,10 @@ print("Could not request results from Google Speech Recognition service; {0}".format(e)) # recognize speech using Google Cloud Speech -GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE""" +# Before run, create local authentication credentials (``gcloud auth application-default login``) try: print("Google Cloud Speech recognition results:") - pprint(r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS, show_all=True)) # pretty-print the recognition result + pprint(r.recognize_google_cloud(audio, show_all=True)) # pretty-print the recognition result except sr.UnknownValueError: print("Google Cloud Speech could not understand audio") except sr.RequestError as e: diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py index a4f10a9b..e864e2a4 100644 --- a/examples/microphone_recognition.py +++ b/examples/microphone_recognition.py @@ -32,9 +32,9 @@ print("Could not request results from Google Speech Recognition service; {0}".format(e)) # recognize speech using Google Cloud Speech -GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE""" +# Before run, create local authentication credentials (``gcloud auth application-default login``) try: - print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS)) + print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio)) except sr.UnknownValueError: print("Google Cloud Speech could not understand audio") except sr.RequestError as e: diff --git a/examples/special_recognizer_features.py b/examples/special_recognizer_features.py index f4365297..1d051ede 100644 --- a/examples/special_recognizer_features.py +++ b/examples/special_recognizer_features.py @@ -35,11 +35,11 @@ # recognize preferred phrases using Google Cloud Speech -GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE""" +# Before run, create local authentication credentials (``gcloud auth application-default login``) try: print("Google Cloud Speech recognition for \"numero\" with different sets of preferred phrases:") - print(r.recognize_google_cloud(audio_fr, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS, preferred_phrases=["noomarow"])) - print(r.recognize_google_cloud(audio_fr, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS, preferred_phrases=["newmarrow"])) + print(r.recognize_google_cloud(audio_fr, preferred_phrases=["noomarow"])) + print(r.recognize_google_cloud(audio_fr, preferred_phrases=["newmarrow"])) except sr.UnknownValueError: print("Google Cloud Speech could not understand audio") except sr.RequestError as e: From 43a440aa1cb3a4b0bbc684d9007b278462f6c2d2 Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 25 Dec 2024 19:04:28 +0900 Subject: [PATCH 6/9] [docs] Refine docs * Update links --- .../recognizers/google_cloud.py | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index 21423ae1..cd6ccdd4 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -7,17 +7,38 @@ from speech_recognition.exceptions import RequestError, UnknownValueError if TYPE_CHECKING: - from google.cloud.speech_v1.types import RecognitionConfig, SpeechContext + from google.cloud.speech_v1.types import ( + RecognitionConfig, + RecognizeResponse, + SpeechContext, + ) from typing_extensions import Required class GoogleCloudRecognizerParameters(TypedDict, total=False): + """Optional parameters. + + The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). Default: ``"en-US"``. + A list of supported language tags can be found in the `Speech-to-Text supported languages `__. + + If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. + This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. + Note that the API imposes certain `restrictions on the list of phrase strings `__. + + ``show_all``: See :py:func:`recognize`. + + ``model``: You can select the model to get best results. (See `RecognitionConfig's documentation `__ for detail) + + ``use_enhanced``: Set to true to use an enhanced model for speech recognition. + """ + # SpeechRecognition specific parameters preferred_phrases: list[str] show_all: bool # Speech-to-Text V1 API's parameters language_code: str + model: str use_enhanced: bool # TODO Add others support @@ -33,6 +54,7 @@ class GoogleCloudSpeechV1Parameters(TypedDict, total=False): language_code: Required[str] speech_contexts: list[SpeechContext] enable_word_time_offsets: bool + model: str use_enhanced: bool @@ -61,27 +83,14 @@ def recognize( audio_data: AudioData, credentials_json_path: str | None = None, **kwargs: GoogleCloudRecognizerParameters, -): +) -> str | RecognizeResponse: """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API. - This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. - - The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation `__. - - If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. - - ``api_params`` are Cloud Speech API-specific parameters as dict (optional). For more information see - - The ``use_enhanced`` is a boolean option. If use_enhanced is set to true and the model field is not set, - then an appropriate enhanced model is chosen if an enhanced model exists for the audio. - If use_enhanced is true and an enhanced version of the specified model does not exist, - then the speech is recognized using the standard version of the specified model. - - Furthermore, if the option ``use_enhanced`` has not been set the option ``model`` can be used, which can be used to select the model best - suited to your domain to get best results. If a model is not explicitly specified, - then we auto-select a model based on the other parameters of this method. + This function requires a Google Cloud Platform account; see the `Set up Speech-to-Text `__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project. + And create local authentication credentials for your user account. The result is a JSON file containing the API credentials. You can specify the JSON file by ``credentials_json_path``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. + For other parameters, see :py:class:`GoogleCloudRecognizerParameters`. Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. """ From 9737a760a71c4f13fffa7330195782b22310a8f1 Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 25 Dec 2024 19:06:31 +0900 Subject: [PATCH 7/9] [docs] Add link to setup Google Cloud project --- README.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index da70104d..7bb0c0cf 100644 --- a/README.rst +++ b/README.rst @@ -160,8 +160,9 @@ You can install it with :command:`python3 -m pip install SpeechRecognition[googl **Prerequisite**: Create local authentication credentials for your Google account -* `Before you begin (Transcribe speech to text by using client libraries) `__ -* Detail: `User credentials (Set up ADC for a local development environment) `__ +* Digest: `Before you begin (Transcribe speech to text by using client libraries) `__ +* `Set up Speech-to-Text `__ +* `User credentials (Set up ADC for a local development environment) `__ Currently only `V1 `__ is supported. (`V2 `__ is not supported) From 49dc9ad430e17508328758f937eb3f5c814e360a Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 25 Dec 2024 19:12:17 +0900 Subject: [PATCH 8/9] [docs] Update signature --- reference/library-reference.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reference/library-reference.rst b/reference/library-reference.rst index 82239fd2..61370837 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -227,8 +227,8 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. -``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json_path: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False, **api_params) -> Union[str, Dict[str, Any]]`` ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json_path: Union[str, None] = None, **kwargs) -> Union[str, Dict[str, Any]]`` +------------------------------------------------------------------------------------------------------------------------------------------------------------- .. autofunction:: speech_recognition.recognizers.google_cloud.recognize From 53dccb542e5d8cb0eaf7667a4a939835cba66c25 Mon Sep 17 00:00:00 2001 From: ftnext Date: Wed, 25 Dec 2024 19:22:21 +0900 Subject: [PATCH 9/9] [refactor] Tweak import --- speech_recognition/recognizers/google_cloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index cd6ccdd4..5c5a7f62 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -7,7 +7,7 @@ from speech_recognition.exceptions import RequestError, UnknownValueError if TYPE_CHECKING: - from google.cloud.speech_v1.types import ( + from google.cloud.speech import ( RecognitionConfig, RecognizeResponse, SpeechContext,