From e4acf977b0c000cd4ebc467fa8de9305bc87a1d0 Mon Sep 17 00:00:00 2001 From: Anthony Zhang Date: Sun, 3 Apr 2016 20:09:46 -0400 Subject: [PATCH] Doc improvements, fix non-16-bit AIFF loading on Python 2, improve PocketSphinx language install procedures --- README.rst | 36 +++++++++++++++++++++++++--------- reference/pocketsphinx.rst | 14 +++++++++++++ speech_recognition/__init__.py | 29 +++++++++++++++++++-------- 3 files changed, 62 insertions(+), 17 deletions(-) diff --git a/README.rst b/README.rst index b4eec911..58874a3f 100644 --- a/README.rst +++ b/README.rst @@ -64,7 +64,7 @@ See the ``examples/`` directory for usage examples: Installing ---------- -First, make sure you have all the requirements listed in the "Requirements" section. +First, make sure you have all the requirements listed in the "Requirements" section. The easiest way to install this is using ``pip install SpeechRecognition``. @@ -75,13 +75,20 @@ In the folder, run ``python setup.py install``. Requirements ------------ -In summary, this library requires: +To use all of the functionality of the library, you should have: -* **Python** 2.6, 2.7, or 3.3+ -* **PyAudio** 0.2.9+ (required only if you need to use microphone input) -* **PocketSphinx** (required only if you need to use the Sphinx recognizer) +* **Python** 2.6, 2.7, or 3.3+ (required) +* **PyAudio** 0.2.9+ (required only if you need to use microphone input, ``Microphone``) +* **PocketSphinx** (required only if you need to use the Sphinx recognizer, ``recognizer_instance.recognize_sphinx``) * **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X) +The following requirements are optional, but can improve or extend functionality in some situations: + +* On Python 2, and only on Python 2, some functions (like ``recognizer_instance.recognize_bing``) will run slower if you do not have **Monotonic for Python 2** installed. 
+* If using CMU Sphinx, you may want to `install additional language packs `__ to support languages like International French or Mandarin Chinese. + +The following sections go over the details of each requirement. + Python ~~~~~~ @@ -90,7 +97,7 @@ The first software requirement is `Python 2.6, 2.7, or Python 3.3+ `__ is also necessary. Version 0.2.9+ is required in order to avoid overflow issues with recording on certain machines. +`PyAudio `__ is required if and only if you want to use microphone input (``Microphone``). PyAudio version 0.2.9+ is required, as earlier versions have overflow issues with recording on certain machines. If not installed, everything in the library will still work, except attempting to instantiate a ``Microphone`` object will throw an ``AttributeError``. @@ -107,7 +114,7 @@ PyAudio `wheel packages `__ for 64-bit Pytho PocketSphinx-Python (for Sphinx users) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -`PocketSphinx-Python `__ is required if and only if you want to use the Sphinx recognizer (``recognizer_instance.recognize_sphinx``). +`PocketSphinx-Python `__ is **required if and only if you want to use the Sphinx recognizer** (``recognizer_instance.recognize_sphinx``). PocketSphinx-Python `wheel packages `__ for 64-bit Python 2.7, 3.4, and 3.5 on Windows are included for convenience, under the ``third-party/`` directory. To install, simply run ``pip install wheel`` followed by ``pip install ./third-party/WHEEL_FILENAME`` (replace ``pip`` with ``pip3`` if using Python 3) in the SpeechRecognition folder. @@ -120,7 +127,7 @@ See `Notes on using PocketSphinx `__ is required to encode the audio data to send to the API. If using Windows (x86 or x86-64), OS X (Intel Macs only, OS X 10.6 or higher), or Linux (x86 or x86-64), the encoder is already bundled with this library - you do not need to install anything else. +A `FLAC encoder `__ is required to encode the audio data to send to the API. 
If using Windows (x86 or x86-64), OS X (Intel Macs only, OS X 10.6 or higher), or Linux (x86 or x86-64), this is **already bundled with this library - you do not need to install anything**. Otherwise, ensure that you have the ``flac`` command line tool, which is often available through the system package manager. @@ -141,10 +148,21 @@ The included ``flac-linux-x86`` executable is built from the `FLAC 1.3.1 source make exit # return to the original shell -The resulting executable can then be found at ``flac-1.3.1/src/flac`` in the build directory. A copy of the source code can also be found at ``third-party/flac-1.3.1.tar.xz``. +The resulting executable can then be found at ``./flac-1.3.1/src/flac`` relative to the working directory. A copy of the source code can also be found at ``third-party/flac-1.3.1.tar.xz``. The included ``flac-mac`` executable is extracted from `xACT 2.37 `__, which is a frontend for FLAC that conveniently includes binaries for all of its encoders. Specifically, it is a copy of ``xACT 2.37/xACT.app/Contents/Resources/flac`` in ``xACT2.37.zip``. +Monotonic for Python 2 (for faster operations in some functions on Python 2) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +On Python 2, and only on Python 2, if you do not install the `Monotonic for Python 2 `__ library, some functions will run slower than they otherwise could (though everything will still work correctly). + +On Python 3, that library's functionality is built into the Python standard library, which makes it unnecessary. + +The library is needed because monotonic time is necessary to handle cache expiry properly in the face of system time changes and other time-related issues. If monotonic time functionality is not available, then things like access token requests will not be cached. + +To install, use `Pip `__: execute ``pip install monotonic`` in a terminal. 
+ Troubleshooting --------------- diff --git a/reference/pocketsphinx.rst b/reference/pocketsphinx.rst index 1aa3c445..0d5d2946 100644 --- a/reference/pocketsphinx.rst +++ b/reference/pocketsphinx.rst @@ -11,6 +11,20 @@ By default, SpeechRecognition's Sphinx functionality supports only US English. A To install a language pack, download the ZIP archives and extract them directly into the module install directory (you can find the module install directory by running ``python -c "import speech_recognition as sr, os.path as p; print(p.dirname(sr.__file__))"``). +Here is a simple Bash script to install all of them (written for Debian-based Linux distributions, since it uses ``apt-get``): + +.. code:: bash + + #!/usr/bin/env bash + SR_LIB=$(python -c "import speech_recognition as sr, os.path as p; print(p.dirname(sr.__file__))") + sudo apt-get install --yes wget unzip + sudo wget https://db.tt/tVNcZXao -O "$SR_LIB/fr-FR.zip" + sudo unzip -o "$SR_LIB/fr-FR.zip" -d "$SR_LIB" + sudo chmod --recursive a+r "$SR_LIB/fr-FR/" + sudo wget https://db.tt/2YQVXmEk -O "$SR_LIB/zh-CN.zip" + sudo unzip -o "$SR_LIB/zh-CN.zip" -d "$SR_LIB" + sudo chmod --recursive a+r "$SR_LIB/zh-CN/" + Once installed, you can simply specify the language using the ``language`` parameter of ``recognizer_instance.recognize_sphinx``. For example, French would be specified with ``"fr-FR"`` and Mandarin with ``"zh-CN"``. 
Building PocketSphinx-Python from source diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 84cfaa5a..0083e9e5 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -3,7 +3,7 @@ """Library for performing speech recognition, with support for several engines and APIs, online and offline.""" __author__ = "Anthony Zhang (Uberi)" -__version__ = "3.4.1" +__version__ = "3.4.2" __license__ = "BSD" import io, os, subprocess, wave, aifc, base64 @@ -184,7 +184,12 @@ def __enter__(self): # run the FLAC converter with the FLAC data to get the AIFF data flac_converter = get_flac_converter() - process = subprocess.Popen([flac_converter, "--stdout", "--totally-silent", "--decode", "--force-aiff-format", "-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + process = subprocess.Popen([ + flac_converter, + "--stdout", "--totally-silent", # put the resulting AIFF file in stdout, and make sure it's not mixed with any program output + "--decode", "--force-aiff-format", # decode the FLAC file into an AIFF file + "-", # the input FLAC file contents will be given in stdin + ], stdin=subprocess.PIPE, stdout=subprocess.PIPE) aiff_data, stderr = process.communicate(flac_data) aiff_file = io.BytesIO(aiff_data) self.audio_reader = aifc.open(aiff_file, "rb") @@ -218,7 +223,7 @@ def read(self, size = -1): if hasattr(audioop, "byteswap"): # ``audioop.byteswap`` was only added in Python 3.4 buffer = audioop.byteswap(buffer, sample_width) else: # manually reverse the bytes of each sample, which is slower but works well enough as a fallback - buffer = buffer[sample_width - 1::-1] + b"".join(buffer[i + sample_width:i:-1] for i in range(1, len(buffer), sample_width)) + buffer = buffer[sample_width - 1::-1] + b"".join(buffer[i + sample_width:i:-1] for i in range(sample_width - 1, len(buffer), sample_width)) if self.audio_reader.getnchannels() != 1: # stereo audio buffer = audioop.tomono(buffer, sample_width, 1, 1) # convert stereo audio 
data to mono return buffer @@ -310,7 +315,7 @@ def get_aiff_data(self, convert_rate = None, convert_width = None): if hasattr(audioop, "byteswap"): # ``audioop.byteswap`` was only added in Python 3.4 raw_data = audioop.byteswap(raw_data, sample_width) else: # manually reverse the bytes of each sample, which is slower but works well enough as a fallback - raw_data = raw_data[sample_width - 1::-1] + b"".join(raw_data[i + sample_width:i:-1] for i in range(1, len(raw_data), sample_width)) + raw_data = raw_data[sample_width - 1::-1] + b"".join(raw_data[i + sample_width:i:-1] for i in range(sample_width - 1, len(raw_data), sample_width)) # generate the AIFF-C file contents with io.BytesIO() as aiff_file: @@ -338,7 +343,12 @@ def get_flac_data(self, convert_rate = None, convert_width = None): # run the FLAC converter with the WAV data to get the FLAC data wav_data = self.get_wav_data(convert_rate, convert_width) flac_converter = get_flac_converter() - process = subprocess.Popen([flac_converter, "--stdout", "--totally-silent", "--best", "-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + process = subprocess.Popen([ + flac_converter, + "--stdout", "--totally-silent", # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output + "--best", # highest level of compression available + "-", # the input WAV file contents will be given in stdin + ], stdin=subprocess.PIPE, stdout=subprocess.PIPE) flac_data, stderr = process.communicate(wav_data) return flac_data @@ -688,7 +698,10 @@ def recognize_bing(self, audio_data, key, language = "en-US", show_all = False): try: from time import monotonic # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ except ImportError: - expire_time = None # monotonic time not available, don't cache access tokens + try: + from monotonic import monotonic # use time.monotonic backport for Python 2 if available (from https://pypi.python.org/pypi/monotonic) 
+ except (ImportError, RuntimeError): + expire_time = None # monotonic time not available, don't cache access tokens if expire_time is None or monotonic() > expire_time: # first credential request, or the access token from the previous one expired # get an access token using OAuth credential_url = "https://oxford-speech.cloudapp.net/token/issueToken" @@ -891,7 +904,7 @@ def shutil_which(pgm): return p # backwards compatibility shims -WavFile = AudioFile +WavFile = AudioFile # WavFile was renamed to AudioFile in 3.4.1 def recognize_att(self, audio_data, app_key, app_secret, language = "en-US", show_all = False): authorization_url = "https://api.att.com/oauth/v4/token" authorization_body = "client_id={0}&client_secret={1}&grant_type=client_credentials&scope=SPEECH".format(app_key, app_secret) @@ -912,4 +925,4 @@ def recognize_att(self, audio_data, app_key, app_secret, language = "en-US", sho for entry in result["Recognition"]["NBest"]: if entry.get("Grade") == "accept" and "ResultText" in entry: return entry["ResultText"] raise UnknownValueError() # no transcriptions available -Recognizer.recognize_att = classmethod(recognize_att) +Recognizer.recognize_att = classmethod(recognize_att) # AT&T API is deprecated and shutting down as of 3.4.0