diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..3ba13e0c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/.github/ISSUE_TEMPLATE/issue_template.md b/.github/ISSUE_TEMPLATE/issue_template.md new file mode 100644 index 00000000..f0295f4b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/issue_template.md @@ -0,0 +1,46 @@ +--- +name: Issue +about: Use this template for bug reports (and so on). +--- + +Steps to reproduce +------------------ + +1. (How do you make the issue happen? Does it happen every time you try it?) +2. (Make sure to go into as much detail as needed to reproduce the issue. Posting your code here can help us resolve the problem much faster!) +3. (If there are any files, like audio recordings, don't forget to include them.) + +Expected behaviour +------------------ + +(What did you expect to happen?) + +Actual behaviour +---------------- + +(What happened instead? How is it different from what you expected?) + +``` +(If the library threw an exception, paste the full stack trace here) +``` + +System information +------------------ + +(Delete all the statements that don't apply.) + +My **system** is . (For example, "Ubuntu 16.04 LTS x64", "Windows 10 x64", or "macOS Sierra".) + +My **Python version** is . (You can check this by running `python -V`.) + +My **Pip version** is . (You can check this by running `pip -V`.) + +My **SpeechRecognition library version** is . (You can check this by running `python -c "import speech_recognition as sr;print(sr.__version__)"`.) + +My **PyAudio library version** is / I don't have PyAudio installed. (You can check this by running `python -c "import pyaudio as p;print(p.__version__)"`.) + +My **microphones** are: (You can check this by running `python -c "import speech_recognition as sr;print(sr.Microphone.list_microphone_names())"`.) + +My **working microphones** are: (You can check this by running `python -c "import speech_recognition as sr;print(sr.Microphone.list_working_microphones())"`.) + +I **installed PocketSphinx from** . (For example, from the Debian repositories, from Homebrew, or from the source code.) 
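If you prefer to gather these details in one go, the following small helper prints most of the information the template asks for. It is only an illustrative sketch, not part of the library or the template; the guarded `pyaudio` import mirrors the fact that PyAudio is optional.

```python
# Illustrative helper: prints the details this issue template asks for.
import platform
import sys

import speech_recognition as sr

print("System:", platform.platform())
print("Python:", sys.version)
print("SpeechRecognition:", sr.__version__)
try:
    import pyaudio  # optional; only needed for microphone support
    print("PyAudio:", pyaudio.__version__)
    print("Microphones:", sr.Microphone.list_microphone_names())
    print("Working microphones:", sr.Microphone.list_working_microphones())
except ImportError:
    print("PyAudio: not installed")
```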
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..6a4291f1 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,18 @@ +name: Static analysis + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + lint: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Run flake8 + run: make lint diff --git a/.github/workflows/rstcheck.yml b/.github/workflows/rstcheck.yml new file mode 100644 index 00000000..c27a54cd --- /dev/null +++ b/.github/workflows/rstcheck.yml @@ -0,0 +1,18 @@ +name: Ensure RST is well-formed + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + rstcheck: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Run rstcheck + run: make rstcheck diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml new file mode 100644 index 00000000..ed5fe9cb --- /dev/null +++ b/.github/workflows/unittests.yml @@ -0,0 +1,61 @@ +name: Unit tests + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + build: + strategy: + fail-fast: true + matrix: + include: + - os: ubuntu-latest + python-version: "3.9" + - os: ubuntu-latest + python-version: "3.10" + - os: ubuntu-latest + python-version: "3.11" + - os: ubuntu-latest + python-version: "3.12" + - os: ubuntu-latest + python-version: "3.13" + - os: windows-latest + python-version: "3.11" + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install build dependencies + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt-get update + # For pocketsphinx (already installed: swig) + sudo apt-get install --no-install-recommends -y libpulse-dev libasound2-dev + # For PyAudio + sudo apt-get install --no-install-recommends -y portaudio19-dev + - name: Install ffmpeg (for Whisper) + uses: FedericoCarboni/setup-ffmpeg@v3 + - name: Install Python dependencies (Ubuntu, <=3.12) + if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.13' + run: | + python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,faster-whisper,openai,groq] + - name: Install Python dependencies (Ubuntu, 3.13) + if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13' + run: | + python -m pip install standard-aifc setuptools + python -m pip install --no-build-isolation .[dev,audio,pocketsphinx,google-cloud,openai,groq] + - name: Install Python dependencies (Windows) + if: matrix.os == 'windows-latest' + run: | + python -m pip install .[dev,whisper-local,faster-whisper,google-cloud,openai,groq] + - name: Test with unittest + run: | + pytest --doctest-modules -v speech_recognition/recognizers/ tests/ diff --git a/.gitignore b/.gitignore index a36ce97c..93e8d09f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,176 @@ -*.egg-info -build -dist -__pycache__ -*.pyc +# ----- Python.gitignore ----- +# from: https://github.com/github/gitignore/blob/main/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before 
PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# ----- End of Python.gitignore ----- + +# SpeechRecognition specific .gitignore speech_recognition/pocketsphinx-data/fr-FR/ speech_recognition/pocketsphinx-data/zh-CN/ fr-FR.zip zh-CN.zip +it-IT.zip pocketsphinx-python/ +examples/TEST.py +*.geany +*.out diff --git a/LICENSE-FLAC.txt b/LICENSE-FLAC.txt new file mode 100644 index 00000000..d159169d --- /dev/null +++ b/LICENSE-FLAC.txt @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. 
+ + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. 
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. 
Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/LICENSE.txt b/LICENSE.txt index 386f7573..1385f6fe 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2014-2016, Anthony Zhang +Copyright (c) 2014-, Anthony Zhang All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/MANIFEST.in b/MANIFEST.in index 9fd40e7c..4071da54 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,6 @@ graft speech_recognition +graft reference recursive-exclude speech_recognition *.pyc include README.rst include LICENSE.txt +include LICENSE-FLAC.txt diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..93cef8fd --- /dev/null +++ b/Makefile @@ -0,0 +1,17 @@ +lint: +# ignore errors for long lines and multi-statement lines + @pipx run flake8 --ignore=E501,E701,W503 --extend-exclude .venv,venv,build --doctests . + +rstcheck: +# PyPI does not support Sphinx directives and roles + @pipx run rstcheck README.rst + @pipx run rstcheck[sphinx] --ignore-directives autofunction reference/*.rst + +distribute: + @pipx run build + @pipx run twine check dist/* + +publish: +# Set PYPI_API_TOKEN before `make publish` + @test -n "${PYPI_API_TOKEN}" + @pipx run twine upload -u __token__ -p ${PYPI_API_TOKEN} dist/* diff --git a/README.rst b/README.rst index e574be4f..7fc072ae 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,5 @@ -Speech Recognition -================== - -.. image:: https://img.shields.io/pypi/dm/SpeechRecognition.svg - :target: https://pypi.python.org/pypi/SpeechRecognition/ - :alt: Downloads +SpeechRecognition +================= .. image:: https://img.shields.io/pypi/v/SpeechRecognition.svg :target: https://pypi.python.org/pypi/SpeechRecognition/ @@ -21,43 +17,67 @@ Speech Recognition :target: https://pypi.python.org/pypi/SpeechRecognition/ :alt: License -Library for performing speech recognition with support for Google Speech Recognition, `Wit.ai `__, `IBM Speech to Text `__, and `AT&T Speech to Text `__. +.. image:: https://api.travis-ci.org/Uberi/speech_recognition.svg?branch=master + :target: https://travis-ci.org/Uberi/speech_recognition + :alt: Continuous Integration Test Results -Links: +Library for performing speech recognition, with support for several engines and APIs, online and offline. -- `PyPI `__ -- `GitHub `__ +**UPDATE 2022-02-09**: Hey everyone! This project started as a tech demo, but these days it needs more time than I have to keep up with all the PRs and issues. Therefore, I'd like to put out an **open invite for collaborators** - just reach out at me@anthonyz.ca if you're interested! -Quickstart: ``pip install SpeechRecognition``. See the "Installing" section for more details. 
+Speech recognition engine/API support: + +* `CMU Sphinx `__ (works offline) +* Google Speech Recognition +* `Google Cloud Speech API `__ +* `Wit.ai `__ +* `Microsoft Azure Speech `__ +* `Microsoft Bing Voice Recognition (Deprecated) `__ +* `Houndify API `__ +* `IBM Speech to Text `__ +* `Snowboy Hotword Detection `__ (works offline) +* `Tensorflow `__ +* `Vosk API `__ (works offline) +* `OpenAI whisper `__ (works offline) +* `OpenAI Whisper API `__ +* `Groq Whisper API `__ + +**Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details. To quickly try it out, run ``python -m speech_recognition`` after installing. -How to cite this library (APA style): +Project links: - Zhang, A. (2016). Speech Recognition (Version 3.1) [Software]. Available from https://github.com/Uberi/speech_recognition#readme. +- `PyPI `__ +- `Source code `__ +- `Issue tracker `__ -How to cite this library (Chicago style): +Library Reference +----------------- + +The `library reference `__ documents every publicly accessible object in the library. This document is also included under ``reference/library-reference.rst``. - Zhang, Anthony. 2016. *Speech Recognition* (version 3.1). +See `Notes on using PocketSphinx `__ for information about installing languages, compiling PocketSphinx, and building language packs from online resources. This document is also included under ``reference/pocketsphinx.rst``. -Also check out the `Python Baidu Yuyin API `__, which is based on an older version of this project, and adds support for `Baidu Yuyin `__. +You have to install Vosk models for using Vosk. `Here `__ are models avaiable. You have to place them in models folder of your project, like "your-project-folder/models/your-vosk-model" Examples -------- -See the ``examples/`` directory for usage examples: +See the ``examples/`` `directory `__ in the repository root for usage examples: - `Recognize speech input from the microphone `__ -- `Transcribe a WAV audio file `__ -- `Save audio data to a WAV file `__ +- `Transcribe an audio file `__ +- `Save audio data to an audio file `__ - `Show extended recognition results `__ - `Calibrate the recognizer energy threshold for ambient noise levels `__ (see ``recognizer_instance.energy_threshold`` for details) - `Listening to a microphone in the background `__ +- `Various other useful recognizer features `__ Installing ---------- -First, make sure you have all the requirements listed in the "Requirements" section. +First, make sure you have all the requirements listed in the "Requirements" section. The easiest way to install this is using ``pip install SpeechRecognition``. @@ -68,542 +88,320 @@ In the folder, run ``python setup.py install``. Requirements ------------ -Python -~~~~~~ - -The first software requirement is `Python 2.6, 2.7, or Python 3.3+ `__. This is required to use the library. - -PyAudio (for microphone users) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If you want to use audio input from microphones, `PyAudio `__ is also necessary. If not installed, the library will still work, but ``Microphone`` will not be defined. - -The installation instructions are quite good as of PyAudio v0.2.9. For convenience, they are summarized below: - -* On Windows, install PyAudio using `Pip `__: execute ``pip install pyaudio`` in a terminal. -* On Debian-derived Linux distributions (like Ubuntu and Mint), install PyAudio using `APT `__: execute ``sudo apt-get install python-pyaudio python3-pyaudio`` in a terminal. 
- * If you want to use the latest version of PyAudio rather than the version in the repositories, you can install the latest release using Pip: execute ``sudo apt-get install portaudio19-dev python-all-dev python3-all-dev && pip install pyaudio`` (replace ``pip`` with ``pip3`` if using Python 3). -* On OS X, install PortAudio using `Homebrew `__: ``brew install portaudio``. Then, install PyAudio using `Pip `__: ``pip install pyaudio``. -* On other POSIX-based systems, install the ``portaudio19-dev`` and ``python-all-dev`` (or ``python3-all-dev`` if using Python 3) packages (or their closest equivalents) using a package manager of your choice, and then install PyAudio using `Pip `__: ``pip install pyaudio`` (replace ``pip`` with ``pip3`` if using Python 3). - -PyAudio `wheel packages `__ for 64-bit Python 2.7, 3.4, and 3.5 on Windows and Linux are included for convenience, under the ``third-party/`` directory. To install, simply run ``pip install wheel`` followed by ``pip install ./third-party/WHEEL_FILENAME`` (replace ``pip`` with ``pip3`` if using Python 3) in the SpeechRecognition folder. - -PocketSphinx-Python (for Sphinx users) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -`PocketSphinx-Python `__ is required if and only if you want to use the Sphinx recognizer (``recognizer_instance.recognize_sphinx``). +To use all of the functionality of the library, you should have: -PocketSphinx-Python `wheel packages `__ for 64-bit Python 2.7, 3.4, and 3.5 on Windows and Linux are included for convenience, under the ``third-party/`` directory. To install, simply run ``pip install wheel`` followed by ``pip install ./third-party/WHEEL_FILENAME`` (replace ``pip`` with ``pip3`` if using Python 3) in the SpeechRecognition folder. +* **Python** 3.9+ (required) +* **PyAudio** 0.2.11+ (required only if you need to use microphone input, ``Microphone``) +* **PocketSphinx** (required only if you need to use the Sphinx recognizer, ``recognizer_instance.recognize_sphinx``) +* **Google API Client Library for Python** (required only if you need to use the Google Cloud Speech API, ``recognizer_instance.recognize_google_cloud``) +* **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X) +* **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``) +* **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``) +* **Faster Whisper** (required only if you need to use Faster Whisper ``recognizer_instance.recognize_faster_whisper``) +* **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_openai``) +* **groq** (required only if you need to use Groq Whisper API speech recognition ``recognizer_instance.recognize_groq``) -Note that the versions available in most package repositories are outdated and will not work with the bundled language data. Using the bundled wheel packages or building from source is recommended. - -Installing other languages -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -By default, SpeechRecognition's Sphinx functionality supports only US English. 
Additional language packs are also available, but not included due to the files being too large: - -* `Metropolitan French `__ -* `Mandarin Chinese `__ - -To install a language pack, download the ZIP archives and extract them directly into the module install directory (you can find the module install directory by running ``python -c "import speech_recognition as sr, os.path as p; print(p.dirname(sr.__file__))"``). - -Once installed, you can simply specify the language using the ``language`` parameter of ``recognizer_instance.recognize_sphinx``. For example, French would be specified with ``"fr-FR"`` and Mandarin with ``"zh-CN"``. - -Building PocketSphinx-Python from source -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -* On Windows: - 1. Install `Python `__, `Pip `__, `SWIG `__, and `Git `__, preferably using a package manager. - 2. Install the necessary `compilers suite `__ (`here's a PDF version `__ in case the link goes down) for compiling modules for your particular Python version: - * `Microsoft Visual C++ Compiler for Python 2.7 `__ for Python 2.7. - * `Visual Studio 2015 Community Edition `__ for Python 3.5. - * The installation process for Python 3.4 is outlined in the article above - 3. Add the folders containing the Python, SWIG, and Git binaries to your ``PATH`` environment variable. - 4. Reboot to apply changes. - 5. If not using Python 2.7, install PocketSphinx using Pip: execute ``pip install pocketsphinx`` in a terminal. Otherwise: - 1. Download the full PocketSphinx-Python source code by running ``git clone --recursive https://github.com/bambocher/pocketsphinx-python``. - 2. Download [msinttypes](https://code.google.com/archive/p/msinttypes/) and copy ``inttypes.h`` and ``stdint.h`` from it into the ``sphinxbase/include/sphinxbase`` folder under the project root folder. This is necessary because the MSVC compiler version used for Python 2.7 is missing a lot of C99 features; msinttypes implements the important ones that Sphinx needs. - 3. Run ``python setup.py install`` to compile and install PocketSphinx. -* On any Debian-derived Linux distributions (like Ubuntu and Mint): - 1. Run ``sudo apt-get install python python-all-dev python-pip build-essential swig git`` for Python 2, or ``sudo apt-get install python3 python3-all-dev python3-pip build-essential swig git`` for Python 3. - 2. Run ``pip install pocketsphinx`` for Python 2, or ``pip3 install pocketsphinx`` for Python 3. -* On other POSIX-based systems: - 1. Install `Python `__, `Pip `__, `SWIG `__, and `Git `__, preferably using a package manager. - 2. Install PocketSphinx-Python using Pip: ``pip install pocketsphinx``. - -To build an installable `wheel package `__ (like the ones included with this project) instead of just installing, run ``git clone --recursive https://github.com/bambocher/pocketsphinx-python && cd pocketsphinx-python && python setup.py bdist_wheel`` instead of ``pip install pocketsphinx``/``python setup.py install``. The resulting Wheel will be found in the ``dist`` folder of the PocketSphinx-Python project directory. - -Notes on the structure of the language data -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -* Every language has its own folder under ``/speech_recognition/pocketsphinx-data/LANGUAGE_NAME/``, where ``LANGUAGE_NAME`` is the IETF language tag, like ``"en-US"`` (US English) or ``"en-GB"`` (UK English). - * For example, the US English data is stored in ``/speech_recognition/pocketsphinx-data/en-US/``. 
- * The ``language`` parameter of ``recognizer_instance.recognize_sphinx`` simply chooses the folder with the given name. -* Languages are composed of 3 parts: - * An acoustic model ``/speech_recognition/pocketsphinx-data/LANGUAGE_NAME/acoustic-model/``, which describes how to interpret audio data. - * Acoustic models can be downloaded from the `CMU Sphinx files `__. These are pretty disorganized, but instructions for cleaning up specific versions are listed below. - * All of these should be 16 kHz (broadband) models, since that's what the library will assume is being used. - * A language model ``/speech_recognition/pocketsphinx-data/LANGUAGE_NAME/language-model.lm.bin`` (in `CMU binary format `__). - * A pronounciation dictionary ``/speech_recognition/pocketsphinx-data/LANGUAGE_NAME/pronounciation-dictionary.dict``, which describes how words in the language are pronounced. - -Notes on building the language data from source -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -* All of the following points assume a Debian-derived Linux Distibution (like Ubuntu or Mint). -* To work with any complete, real-world languages, you will need quite a bit of RAM (16 GB recommended) and a fair bit of disk space (20 GB recommended). -* `SphinxBase `__ is needed for all language model file format conversions. We use it to convert between ``*.dmp`` DMP files (an obselete Sphinx binary format), ``*.lm`` ARPA files, and Sphinx binary ``*.lm.bin`` files: - * Install all the SphinxBase build dependencies with ``sudo apt-get install build-essential automake autotools-dev autoconf libtool``. - * Download and extract the `SphinxBase source code `__. - * Follow the instructions in the README to install SphinxBase. Basically, run ``sh autogen.sh --force && ./configure && make && sudo make install`` in the SphinxBase folder. -* Pruning (getting rid of less important information) is useful if language model files are too large. We can do this using `IRSTLM `__: - * Install all the IRSTLM build dependencies with ``sudo apt-get install build-essential automake autotools-dev autoconf libtool`` - * Download and extract the ``IRSTLM source code `__. - * Follow the instructions in the README to install IRSTLM. Basically, run ``sh regenerate-makefiles.sh --force && ./configure && make && sudo make install`` in the IRSTLM folder. - * If the language model is not in ARPA format, convert it to the ARPA format. To do this, ensure that SphinxBase is installed and run ``sphinx_lm_convert -i LANGUAGE_MODEL_FILE_GOES_HERE -o language-model.lm -ofmt arpa``. - * Prune the model using IRSTLM: run ``prune-lm --threshold=1e-8 t.lm pruned.lm`` to prune with a threshold of 0.00000001. The higher the threshold, the smaller the resulting file. - * Convert the model back into binary format if it was originally not in ARPA format. To do this, ensure that SphinxBase is installed and run ``sphinx_lm_convert -i language-model.lm -o LANGUAGE_MODEL_FILE_GOES_HERE``. -* US English: ``/speech_recognition/pocketsphinx-data/en-US/`` is taken directly from the contents of `PocketSphinx's US English model `__. -* Metropolitan French: ``/speech_recognition/pocketsphinx-data/fr-FR/``: - * ``/speech_recognition/pocketsphinx-data/fr-FR/language-model.lm.bin`` is ``fr-small.lm.bin`` from the `Sphinx French language model `__. - * ``/speech_recognition/pocketsphinx-data/fr-FR/pronounciation-dictionary.dict`` is ``fr.dict`` from the `Sphinx French language model `__. 
- * ``/speech_recognition/pocketsphinx-data/fr-FR/acoustic-model/`` is extracted from ``cmusphinx-fr-5.2.tar.gz`` in the `Sphinx French acoustic model `__. - * To get better French recognition accuracy at the expense of higher disk space and RAM usage: - 1. Download ``fr.lm.gmp`` from the `Sphinx French language model `__. - 2. Convert from DMP (an obselete Sphinx binary format) to ARPA format: ``sphinx_lm_convert -i fr.lm.gmp -o french.lm.bin``. - 3. Replace ``/speech_recognition/pocketsphinx-data/fr-FR/language-model.lm.bin`` with ``french.lm.bin`` created in the previous step. -* Mandarin Chinese: ``/speech_recognition/pocketsphinx-data/zh-CN/``: - * ``/speech_recognition/pocketsphinx-data/zh-CN/language-model.lm.bin`` is generated as follows: - 1. Download ``zh_broadcastnews_64000_utf8.DMP`` from the `Sphinx Mandarin language model `__. - 2. Convert from DMP (an obselete Sphinx binary format) to ARPA format: ``sphinx_lm_convert -i zh_broadcastnews_64000_utf8.DMP -o chinese.lm -ofmt arpa``. - 3. Prune with a threshold of 0.00000004 using ``prune-lm --threshold=4e-8 chinese.lm chinese.lm``. - 4. Convert from ARPA format to Sphinx binary format: ``sphinx_lm_convert -i chinese.lm -o chinese.lm.bin``. - 5. Replace ``/speech_recognition/pocketsphinx-data/zh-CN/language-model.lm.bin`` with ``chinese.lm.bin`` created in the previous step. - * ``/speech_recognition/pocketsphinx-data/zh-CN/pronounciation-dictionary.dict`` is ``zh_broadcastnews_utf8.dic`` from the `Sphinx Mandarin language model `__. - * ``/speech_recognition/pocketsphinx-data/zh-CN/acoustic-model/`` is extracted from ``zh_broadcastnews_16k_ptm256_8000.tar.bz2`` in the `Sphinx Mandarin acoustic model `__. - * To get better Chinese recognition accuracy at the expense of higher disk space and RAM usage, simply skip step 3 when preparing ``zh_broadcastnews_64000_utf8.DMP``. - -FLAC (for some systems) -~~~~~~~~~~~~~~~~~~~~~~~ - -A FLAC encoder is required to encode the audio data to send to the API. If using Windows, OS X, or Linux on an i385-compatible architecture, the encoder is already bundled with this library - you do not need to install anything else. - -Otherwise, ensure that you have the ``flac`` command line tool, which is often available through the system package manager. - -In summary, this library requires: - -* Python 2.6, 2.7, or 3.3+ -* PyAudio (required only if you need to use microphone input) -* PocketSphinx (required only if you need to use the Sphinx recognizer) -* FLAC encoder (required only if the system is not x86-based Windows/Linux/OS X) - -Troubleshooting ---------------- - -The ``Microphone`` class is missing/not defined! -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This class is not defined when PyAudio is not available. +The following requirements are optional, but can improve or extend functionality in some situations: -Make sure you have PyAudio installed, and make sure you can import it correctly. Test this out by opening a Python console (make sure to use the same version you're running your program with!) and typing in ``import pyaudio``. If you get an error, PyAudio is not installed or not configured correctly. +* If using CMU Sphinx, you may want to `install additional language packs `__ to support languages like International French or Mandarin Chinese. -See the "Requirements" section for more information about installing PyAudio. - -The recognizer tries to recognize speech even when I'm not speaking. 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Try increasing the ``recognizer_instance.energy_threshold`` property. This is basically how sensitive the recognizer is to when recognition should start. Higher values mean that it will be less sensitive, which is useful if you are in a loud room. - -This value depends entirely on your microphone or audio data. There is no one-size-fits-all value, but good values typically range from 50 to 4000. - -The recognizer can't recognize speech right after it starts listening for the first time. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The following sections go over the details of each requirement. -The ``recognizer_instance.energy_threshold`` property is probably set to a value that is too high to start off with, and then being adjusted lower automatically by dynamic energy threshold adjustment. Before it is at a good level, the energy threshold is so high that speech is just considered ambient noise. - -The solution is to decrease this threshold, or call ``recognizer_instance.adjust_for_ambient_noise`` beforehand, which will set the threshold to a good value automatically. - -The recognizer doesn't understand my particular language/dialect. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Try setting the recognition language to your language/dialect. To do this, see the documentation for ``recognizer_instance.recognize_sphinx``, ``recognizer_instance.recognize_google``, ``recognizer_instance.recognize_wit``, ``recognizer_instance.recognize_ibm``, and ``recognizer_instance.recognize_att``. - -For example, if your language/dialect is British English, it is better to use ``"en-GB"`` as the language rather than ``"en-US"``. - -The code examples throw ``UnicodeEncodeError: 'ascii' codec can't encode character`` when run. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -When you're using Python 2, and your language uses non-ASCII characters, and the terminal or file-like object you're printing to only supports ASCII, an error is thrown when trying to write non-ASCII characters. - -This is because in Python 2, ``recognizer_instance.recognize_sphinx``, ``recognizer_instance.recognize_google``, ``recognizer_instance.recognize_wit``, ``recognizer_instance.recognize_ibm``, and ``recognizer_instance.recognize_att`` return unicode strings (``u"something"``) rather than byte strings (``"something"``). In Python 3, all strings are unicode strings. - -To make printing of unicode strings work in Python 2 as well, replace all print statements in your code of the following form: - - .. code:: python - - print SOME_UNICODE_STRING - -With the following: - - .. code:: python - - print SOME_UNICODE_STRING.encode("utf8") - -This change, however, will prevent the code from working in Python 3. - -The program doesn't run when compiled with `PyInstaller `__. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Python +~~~~~~ -As of PyInstaller version 3.0, SpeechRecognition is supported out of the box. If you're getting weird issues when compiling your program using PyInstaller, simply update PyInstaller. +The first software requirement is `Python 3.9+ `__. This is required to use the library. -You can easily do this by running ``pip install --upgrade pyinstaller``. 
+PyAudio (for microphone users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -On Ubuntu/Debian, I get errors like "jack server is not running or cannot be started" or "Cannot lock down [...] byte memory area (Cannot allocate memory)". -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`PyAudio `__ is required if and only if you want to use microphone input (``Microphone``). PyAudio version 0.2.11+ is required, as earlier versions have known memory management bugs when recording from microphones in certain situations. -The Linux audio stack is pretty fickle. There are a few things that can cause these issues. +If not installed, everything in the library will still work, except attempting to instantiate a ``Microphone`` object will raise an ``AttributeError``. -First, make sure JACK is installed - to install it, run ``sudo apt-get install multimedia-jack`` +The installation instructions on the PyAudio website are quite good - for convenience, they are summarized below: -You will then want to configure the JACK daemon correctly to avoid that "Cannot allocate memory" error. Run ``sudo dpkg-reconfigure -p high jackd2`` and select "Yes" to do so. +* On Windows, install with PyAudio using `Pip `__: execute ``pip install SpeechRecognition[audio]`` in a terminal. +* On Debian-derived Linux distributions (like Ubuntu and Mint), install PyAudio using `APT `__: execute ``sudo apt-get install python-pyaudio python3-pyaudio`` in a terminal. + * If the version in the repositories is too old, install the latest release using Pip: execute ``sudo apt-get install portaudio19-dev python-all-dev python3-all-dev && sudo pip install SpeechRecognition[audio]`` (replace ``pip`` with ``pip3`` if using Python 3). +* On OS X, install PortAudio using `Homebrew `__: ``brew install portaudio``. Then, install with PyAudio using `Pip `__: ``pip install SpeechRecognition[audio]``. +* On other POSIX-based systems, install the ``portaudio19-dev`` and ``python-all-dev`` (or ``python3-all-dev`` if using Python 3) packages (or their closest equivalents) using a package manager of your choice, and then install with PyAudio using `Pip `__: ``pip install SpeechRecognition[audio]`` (replace ``pip`` with ``pip3`` if using Python 3). -Now, you will want to make sure your current user is in the ``audio`` group. You can add your current user to this group by running ``sudo adduser $(whoami) audio``. +PocketSphinx (for Sphinx users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Unfortunately, these changes will require you to reboot before they take effect. +`PocketSphinx `__ is **required if and only if you want to use the Sphinx recognizer** (``recognizer_instance.recognize_sphinx``). -After rebooting, run ``pulseaudio --kill``, followed by ``jack_control start``, to fix the "jack server is not running or cannot be started" error. +On Linux and other POSIX systems (such as OS X), run ``pip install SpeechRecognition[pocketsphinx]``. Follow the instructions under "Building PocketSphinx-Python from source" in `Notes on using PocketSphinx `__ for installation instructions. -On Ubuntu/Debian, I get annoying output in the terminal saying things like "bt_audio_service_open: [...] Connection refused" and various others. 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Note that the versions available in most package repositories are outdated and will not work with the bundled language data. Using the bundled wheel packages or building from source is recommended. -The "bt_audio_service_open" error means that you have a Bluetooth audio device, but as a physical device is not currently connected, we can't actually use it - if you're not using a Bluetooth microphone, then this can be safely ignored. If you are, and audio isn't working, then double check to make sure your microphone is actually connected. There does not seem to be a simple way to disable these messages. +See `Notes on using PocketSphinx `__ for information about installing languages, compiling PocketSphinx, and building language packs from online resources. This document is also included under ``reference/pocketsphinx.rst``. -For errors of the form "ALSA lib [...] Unknown PCM", see `this StackOverflow answer `__. Basically, to get rid of an error of the form "Unknown PCM cards.pcm.rear", simply comment out ``pcm.rear cards.pcm.rear`` in ``/usr/share/alsa/alsa.conf``, ``~/.asoundrc``, and ``/etc/asound.conf``. +Vosk (for Vosk users) +~~~~~~~~~~~~~~~~~~~~~ +Vosk API is **required if and only if you want to use Vosk recognizer** (``recognizer_instance.recognize_vosk``). -On OS X, I get a ``ChildProcessError`` saying that it couldn't find the system FLAC converter, even though it's installed. -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +You can install it with ``python3 -m pip install vosk``. -Installing [FLAC for OS X](https://xiph.org/flac/download.html) directly from the source code will not work, since it doesn't correctly add the executables to the search path. +You also have to install Vosk Models: -Installing FLAC using [Homebrew](http://brew.sh/) ensures that the search path is correctly updated. First, ensure you have Homebrew, then run ``brew install flac`` to install the necessary files. +`Here `__ are models avaiable for download. You have to place them in models folder of your project, like "your-project-folder/models/your-vosk-model" -Reference ---------- +Google Cloud Speech Library for Python (for Google Cloud Speech-to-Text API users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -``Microphone(device_index = None, sample_rate = 16000, chunk_size = 1024)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The library `google-cloud-speech `__ is **required if and only if you want to use Google Cloud Speech-to-Text API** (``recognizer_instance.recognize_google_cloud``). +You can install it with ``python3 -m pip install SpeechRecognition[google-cloud]``. +(ref: `official installation instructions `__) -This is available if PyAudio is available, and is undefined otherwise. +**Prerequisite**: Create local authentication credentials for your Google account -Creates a new ``Microphone`` instance, which represents a physical microphone on the computer. Subclass of ``AudioSource``. +* Digest: `Before you begin (Transcribe speech to text by using client libraries) `__ +* `Set up Speech-to-Text `__ +* `User credentials (Set up ADC for a local development environment) `__ -If ``device_index`` is unspecified or ``None``, the default microphone is used as the audio source. 
Otherwise, ``device_index`` should be the index of the device to use for audio input. +Currently only `V1 `__ is supported. (`V2 `__ is not supported) -A device index is an integer between 0 and ``pyaudio.get_device_count() - 1`` (assume we have used ``import pyaudio`` beforehand) inclusive. It represents an audio device such as a microphone or speaker. See the `PyAudio documentation `__ for more details. +FLAC (for some systems) +~~~~~~~~~~~~~~~~~~~~~~~ -The microphone audio is recorded in chunks of ``chunk_size`` samples, at a rate of ``sample_rate`` samples per second (Hertz). +A `FLAC encoder `__ is required to encode the audio data to send to the API. If using Windows (x86 or x86-64), OS X (Intel Macs only, OS X 10.6 or higher), or Linux (x86 or x86-64), this is **already bundled with this library - you do not need to install anything**. -Higher ``sample_rate`` values result in better audio quality, but also more bandwidth (and therefore, slower recognition). Additionally, some machines, such as some Raspberry Pi models, can't keep up if this value is too high. +Otherwise, ensure that you have the ``flac`` command line tool, which is often available through the system package manager. For example, this would usually be ``sudo apt-get install flac`` on Debian-derivatives, or ``brew install flac`` on OS X with Homebrew. -Higher ``chunk_size`` values help avoid triggering on rapidly changing ambient noise, but also makes detection less sensitive. This value, generally, should be left at its default. +Whisper (for Whisper users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Whisper is **required if and only if you want to use whisper** (``recognizer_instance.recognize_whisper``). -Instances of this class are context managers, and are designed to be used with ``with`` statements: +You can install it with ``python3 -m pip install SpeechRecognition[whisper-local]``. -.. code:: python +Faster Whisper (for Faster Whisper users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - with Microphone() as source: # open the microphone and start recording - pass # do things here - ``source`` is the Microphone instance created above - # the microphone is automatically released at this point +The library `faster-whisper `__ is **required if and only if you want to use Faster Whisper** (``recognizer_instance.recognize_faster_whisper``). -``Microphone.list_microphone_names()`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +You can install it with ``python3 -m pip install SpeechRecognition[faster-whisper]``. -Returns a list of the names of all available microphones. For microphones where the name can't be retrieved, the list entry contains ``None`` instead. +OpenAI Whisper API (for OpenAI Whisper API users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The index of each microphone's name is the same as its device index when creating a ``Microphone`` instance - indices in this list can be used as values of ``device_index``. +The library `openai `__ is **required if and only if you want to use OpenAI Whisper API** (``recognizer_instance.recognize_openai``). -To create a ``Microphone`` instance by name: +You can install it with ``python3 -m pip install SpeechRecognition[openai]``. -.. code:: python +Please set the environment variable ``OPENAI_API_KEY`` before calling ``recognizer_instance.recognize_openai``. 
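To show how these recognizers are invoked once the corresponding extras are installed, here is an illustrative sketch that transcribes one audio file with the local Whisper model and then with the OpenAI Whisper API. It mirrors the calls used in the bundled examples; the file path and API key are placeholders.

.. code:: python

    import os

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("english.wav") as source:  # placeholder path to a supported audio file
        audio = r.record(source)  # read the entire audio file

    # local Whisper model (requires the ``whisper-local`` extra)
    print("Whisper thinks you said " + r.recognize_whisper(audio, language="english"))

    # OpenAI Whisper API (requires the ``openai`` extra and an API key)
    os.environ["OPENAI_API_KEY"] = "INSERT OPENAI API KEY HERE"  # placeholder key
    print("OpenAI Whisper API thinks you said " + r.recognize_openai(audio))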
- m = None - for microphone_name in Microphone.list_microphone_names(): - if microphone_name == "HDA Intel HDMI: 0 (hw:0,3)": - m = Microphone(i) +Groq Whisper API (for Groq Whisper API users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -``WavFile(filename_or_fileobject)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The library `groq `__ is **required if and only if you want to use Groq Whisper API** (``recognizer_instance.recognize_groq``). -Creates a new ``WavFile`` instance given a WAV audio file ``filename_or_fileobject``. Subclass of ``AudioSource``. +You can install it with ``python3 -m pip install SpeechRecognition[groq]``. -If ``filename_or_fileobject`` is a string, then it is interpreted as a path to a WAV audio file (mono or stereo) on the filesystem. Otherwise, ``filename_or_fileobject`` should be a file-like object such as ``io.BytesIO`` or similar. +Please set the environment variable ``GROQ_API_KEY`` before calling ``recognizer_instance.recognize_groq``. -Note that the WAV file must be in PCM/LPCM format; WAVE_FORMAT_EXTENSIBLE and compressed WAV are not supported and may result in undefined behaviour. +Troubleshooting +--------------- -Instances of this class are context managers, and are designed to be used with ``with`` statements: +The recognizer tries to recognize speech even when I'm not speaking, or after I'm done speaking. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. code:: python +Try increasing the ``recognizer_instance.energy_threshold`` property. This is basically how sensitive the recognizer is to when recognition should start. Higher values mean that it will be less sensitive, which is useful if you are in a loud room. - import speech_recognition as sr - with sr.WavFile("SOMETHING.wav") as source: # open the WAV file for reading - pass # do things here - ``source`` is the WavFile instance created above +This value depends entirely on your microphone or audio data. There is no one-size-fits-all value, but good values typically range from 50 to 4000. -``wavfile_instance.DURATION`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Also, check on your microphone volume settings. If it is too sensitive, the microphone may be picking up a lot of ambient noise. If it is too insensitive, the microphone may be rejecting speech as just noise. -Represents the length of the audio stored in the WAV file in seconds. This property is only available when inside a context - essentially, that means it should only be accessed inside a ``with wavfile_instance ...`` statement. Outside of contexts, this property is ``None``. +The recognizer can't recognize speech right after it starts listening for the first time. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This is useful when combined with the ``offset`` parameter of ``recognizer_instance.record``, since when together it is possible to perform speech recognition in chunks. +The ``recognizer_instance.energy_threshold`` property is probably set to a value that is too high to start off with, and then being adjusted lower automatically by dynamic energy threshold adjustment. Before it is at a good level, the energy threshold is so high that speech is just considered ambient noise. -However, note that recognizing speech in multiple chunks is not the same as recognizing the whole thing at once. If spoken words appear on the boundaries that we split the audio into chunks on, each chunk only gets part of the word, which may result in inaccurate results. 
+The solution is to decrease this threshold, or call ``recognizer_instance.adjust_for_ambient_noise`` beforehand, which will set the threshold to a good value automatically. -``Recognizer()`` -~~~~~~~~~~~~~~~~ +The recognizer doesn't understand my particular language/dialect. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Creates a new ``Recognizer`` instance, which represents a collection of speech recognition settings and functionality. +Try setting the recognition language to your language/dialect. To do this, see the documentation for ``recognizer_instance.recognize_sphinx``, ``recognizer_instance.recognize_google``, ``recognizer_instance.recognize_wit``, ``recognizer_instance.recognize_bing``, ``recognizer_instance.recognize_api``, ``recognizer_instance.recognize_houndify``, and ``recognizer_instance.recognize_ibm``. -``recognizer_instance.energy_threshold = 300`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For example, if your language/dialect is British English, it is better to use ``"en-GB"`` as the language rather than ``"en-US"``. -Represents the energy level threshold for sounds. Values below this threshold are considered silence, and values above this threshold are considered speech. Can be changed. +The recognizer hangs on ``recognizer_instance.listen``; specifically, when it's calling ``Microphone.MicrophoneStream.read``. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This is adjusted automatically if dynamic thresholds are enabled (see ``recognizer_instance.dynamic_energy_threshold``). A good starting value will generally allow the automatic adjustment to reach a good value faster. +This usually happens when you're using a Raspberry Pi board, which doesn't have audio input capabilities by itself. This causes the default microphone used by PyAudio to simply block when we try to read it. If you happen to be using a Raspberry Pi, you'll need a USB sound card (or USB microphone). -This threshold is associated with the perceived loudness of the sound, but it is a nonlinear relationship. The actual energy threshold you will need depends on your microphone sensitivity or audio data. Typical values for a silent room are 0 to 100, and typical values for speaking are between 150 and 3500. Ambient (non-speaking) noise has a significant impact on what values will work best. +Once you do this, change all instances of ``Microphone()`` to ``Microphone(device_index=MICROPHONE_INDEX)``, where ``MICROPHONE_INDEX`` is the hardware-specific index of the microphone. -If you're having trouble with the recognizer trying to recognize words even when you're not speaking, try tweaking this to a higher value. If you're having trouble with the recognizer not recognizing your words when you are speaking, try tweaking this to a lower value. For example, a sensitive microphone or microphones in louder rooms might have a ambient energy level of up to 4000: +To figure out what the value of ``MICROPHONE_INDEX`` should be, run the following code: .. code:: python import speech_recognition as sr - r = sr.Recognizer() - r.energy_threshold = 4000 - # rest of your code goes here - -The dynamic energy threshold setting can mitigate this by increasing or decreasing this automatically to account for ambient noise. However, this takes time to adjust, so it is still possible to get the false positive detections before the threshold settles into a good value. 
- -To avoid this, use ``recognizer_instance.adjust_for_ambient_noise(source, duration = 1)`` to calibrate the level to a good value. Alternatively, simply set this property to a high value initially (4000 works well), so the threshold is always above ambient noise levels: over time, it will be automatically decreased to account for ambient noise levels. - -``recognizer_instance.dynamic_energy_threshold = True`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Represents whether the energy level threshold (see ``recognizer_instance.energy_threshold``) for sounds should be automatically adjusted based on the currently ambient noise level while listening. Can be changed. - -Recommended for situations where the ambient noise level is unpredictable, which seems to be the majority of use cases. If the ambient noise level is strictly controlled, better results might be achieved by setting this to ``False`` to turn it off. - -``recognizer_instance.dynamic_energy_adjustment_damping = 0.15`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If the dynamic energy threshold setting is enabled (see ``recognizer_instance.dynamic_energy_threshold``), represents approximately the fraction of the current energy threshold that is retained after one second of dynamic threshold adjustment. Can be changed (not recommended). - -Lower values allow for faster adjustment, but also make it more likely to miss certain phrases (especially those with slowly changing volume). This value should be between 0 and 1. As this value approaches 1, dynamic adjustment has less of an effect over time. When this value is 1, dynamic adjustment has no effect. - -``recognizer_instance.dynamic_energy_adjustment_ratio = 1.5`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If the dynamic energy threshold setting is enabled (see ``recognizer_instance.dynamic_energy_threshold``), represents the minimum factor by which speech is louder than ambient noise. Can be changed (not recommended). - -For example, the default value of 1.5 means that speech is at least 1.5 times louder than ambient noise. Smaller values result in more false positives (but fewer false negatives) when ambient noise is loud compared to speech. - -``recognizer_instance.pause_threshold = 0.8`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Represents the minimum length of silence (in seconds) that will register as the end of a phrase. Can be changed. - -Smaller values result in the recognition completing more quickly, but might result in slower speakers being cut off. - -``recognizer_instance.record(source, duration = None, offset = None)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Records up to ``duration`` seconds of audio from ``source`` (an ``AudioSource`` instance) starting at ``offset`` (or at the beginning if not specified) into an ``AudioData`` instance, which it returns. - -If ``duration`` is not specified, then it will record until there is no more audio input. - -``recognizer_instance.adjust_for_ambient_noise(source, duration = 1)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Adjusts the energy threshold dynamically using audio from ``source`` (an ``AudioSource`` instance) to account for ambient noise. - -Intended to calibrate the energy threshold with the ambient energy level. Should be used on periods of audio without speech - will stop early if any speech is detected. 
- -The ``duration`` parameter is the maximum number of seconds that it will dynamically adjust the threshold for before returning. This value should be at least 0.5 in order to get a representative sample of the ambient noise. - -``recognizer_instance.listen(source, timeout = None)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns. - -This is done by waiting until the audio has an energy above ``recognizer_instance.energy_threshold`` (the user has started speaking), and then recording until it encounters ``recognizer_instance.pause_threshold`` seconds of non-speaking or there is no more audio input. The ending silence is not included. - -The ``timeout`` parameter is the maximum number of seconds that it will wait for a phrase to start before giving up and throwing an ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, it will wait indefinitely. - -``recognizer_instance.listen_in_background(source, callback)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Spawns a thread to repeatedly record phrases from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance and call ``callback`` with that ``AudioData`` instance as soon as each phrase are detected. - -Returns a function object that, when called, requests that the background listener thread stop, and waits until it does before returning. The background thread is a daemon and will not stop the program from exiting if there are no other non-daemon threads. - -Phrase recognition uses the exact same mechanism as ``recognizer_instance.listen(source)``. - -The ``callback`` parameter is a function that should accept two parameters - the ``recognizer_instance``, and an ``AudioData`` instance representing the captured audio. Note that ``callback`` function will be called from a non-main thread. + for index, name in enumerate(sr.Microphone.list_microphone_names()): + print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) -``recognizer_instance.recognize_sphinx(audio_data, language = "en-US", show_all = False)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This will print out something like the following: -Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx. - -The recognition language is determined by ``language``, an IETF language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See the "Installing other languages" section in the README for information about additional language packs. - -Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Hypothesis`` object generated by Sphinx. - -Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation. - -``recognizer_instance.recognize_google(audio_data, key = None, language = "en-US", show_all = False)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API. - -The Google Speech Recognition API key is specified by ``key``. 
If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**. - -To obtain your own API key, simply follow the steps on the `API Keys `__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API". Note that **the API quota for your own keys is 50 requests per day**, and there is currently no way to raise this limit. - -The recognition language is determined by ``language``, an IETF language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. A list of supported language codes can be found `here `__. Basically, language codes can be just the language (``en``), or a language with a dialect (``en-US``). - -Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary. - -Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the key isn't valid, the quota for the key is maxed out, or there is no internet connection. - -``recognizer_instance.recognize_wit(audio_data, key, show_all = False)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Wit.ai API. - -The Wit.ai API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account `__ and creating an app. You will need to add at least one intent (recognizable sentence) before the API key can be accessed, though the actual intent values don't matter. - -To get the API key for a Wit.ai app, go to the app settings, go to the section titled "API Details", and look for "Server Access Token" or "Client Access Token". If the desired field is blank, click on the "Reset token" button on the right of the field. Wit.ai API keys are 32-character uppercase alphanumeric strings. - -Though Wit.ai is designed to be used with a fixed set of phrases, it still provides services for general-purpose speech recognition. - -The recognition language is configured in the Wit.ai app settings. +:: -Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. + Microphone with name "HDA Intel HDMI: 0 (hw:0,3)" found for `Microphone(device_index=0)` + Microphone with name "HDA Intel HDMI: 1 (hw:0,7)" found for `Microphone(device_index=1)` + Microphone with name "HDA Intel HDMI: 2 (hw:0,8)" found for `Microphone(device_index=2)` + Microphone with name "Blue Snowball: USB Audio (hw:1,0)" found for `Microphone(device_index=3)` + Microphone with name "hdmi" found for `Microphone(device_index=4)` + Microphone with name "pulse" found for `Microphone(device_index=5)` + Microphone with name "default" found for `Microphone(device_index=6)` -Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the key isn't valid, the quota for the key is maxed out, or there is no internet connection. +Now, to use the Snowball microphone, you would change ``Microphone()`` to ``Microphone(device_index=3)``. 
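Putting that together, a minimal sketch of listening on an explicitly chosen device looks like the following (index 3 is used purely as an illustration to match the listing above; yours will differ):

.. code:: python

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.Microphone(device_index=3) as source:  # the index reported for your microphone above
        r.adjust_for_ambient_noise(source)
        print("Say something!")
        audio = r.listen(source)

    try:
        print("Google Speech Recognition thinks you said " + r.recognize_google(audio))
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")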
-``recognizer_instance.recognize_ibm(audio_data, username, password, language = "en-US", show_all = False)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Calling ``Microphone()`` gives the error ``IOError: No Default Input Device Available``. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the IBM Speech to Text API. +As the error says, the program doesn't know which microphone to use. -The IBM Speech to Text username and password are specified by ``username`` and ``password``, respectively. Unfortunately, these are not available without an account. IBM has published instructions for obtaining these credentials in the `IBM Watson Developer Cloud documentation `__. +To proceed, either use ``Microphone(device_index=MICROPHONE_INDEX, ...)`` instead of ``Microphone(...)``, or set a default microphone in your OS. You can obtain possible values of ``MICROPHONE_INDEX`` using the code in the troubleshooting entry right above this one. -The recognition language is determined by ``language``, an IETF language tag with a dialect like ``"en-US"`` or ``"es-ES"``, defaulting to US English. At the moment, this supports the tags ``"en-US"`` and ``"es-ES"``. +The program doesn't run when compiled with `PyInstaller `__. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. +As of PyInstaller version 3.0, SpeechRecognition is supported out of the box. If you're getting weird issues when compiling your program using PyInstaller, simply update PyInstaller. -Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if an error occurred, such as an invalid key, or a broken internet connection. +You can easily do this by running ``pip install --upgrade pyinstaller``. -``recognizer_instance.recognize_att(audio_data, app_key, app_secret, language = "en-US", show_all = False)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +On Ubuntu/Debian, I get annoying output in the terminal saying things like "bt_audio_service_open: [...] Connection refused" and various others. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the AT&T Speech to Text API. +The "bt_audio_service_open" error means that you have a Bluetooth audio device, but as a physical device is not currently connected, we can't actually use it - if you're not using a Bluetooth microphone, then this can be safely ignored. If you are, and audio isn't working, then double check to make sure your microphone is actually connected. There does not seem to be a simple way to disable these messages. -The AT&T Speech to Text app key and app secret are specified by ``app_key`` and ``app_secret``, respectively. Unfortunately, these are not available without `signing up for an account `__ and creating an app. +For errors of the form "ALSA lib [...] Unknown PCM", see `this StackOverflow answer `__. 
Basically, to get rid of an error of the form "Unknown PCM cards.pcm.rear", simply comment out ``pcm.rear cards.pcm.rear`` in ``/usr/share/alsa/alsa.conf``, ``~/.asoundrc``, and ``/etc/asound.conf``. -To get the app key and app secret for an AT&T app, go to the `My Apps page `__ and look for "APP KEY" and "APP SECRET". AT&T app keys and app secrets are 32-character lowercase alphanumeric strings. +For "jack server is not running or cannot be started" or "connect(2) call to /dev/shm/jack-1000/default/jack_0 failed (err=No such file or directory)" or "attempt to connect to server failed", these are caused by ALSA trying to connect to JACK, and can be safely ignored. I'm not aware of any simple way to turn those messages off at this time, besides `entirely disabling printing while starting the microphone `__. -The recognition language is determined by ``language``, an IETF language tag with a dialect like ``"en-US"`` or ``"es-ES"``, defaulting to US English. At the moment, this supports the tags ``"en-US"`` and ``"es-ES"``. +On OS X, I get a ``ChildProcessError`` saying that it couldn't find the system FLAC converter, even though it's installed. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. +Installing `FLAC for OS X `__ directly from the source code will not work, since it doesn't correctly add the executables to the search path. -Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the key isn't valid, or there is no internet connection. +Installing FLAC using `Homebrew `__ ensures that the search path is correctly updated. First, ensure you have Homebrew, then run ``brew install flac`` to install the necessary files. -``AudioSource`` -~~~~~~~~~~~~~~~ +Developing +---------- -Base class representing audio sources. Do not instantiate. +To hack on this library, first make sure you have all the requirements listed in the "Requirements" section. -Instances of subclasses of this class, such as ``Microphone`` and ``WavFile``, can be passed to things like ``recognizer_instance.record`` and ``recognizer_instance.listen``. +- Most of the library code lives in ``speech_recognition/__init__.py``. +- Examples live under the ``examples/`` `directory `__, and the demo script lives in ``speech_recognition/__main__.py``. +- The FLAC encoder binaries are in the ``speech_recognition/`` `directory `__. +- Documentation can be found in the ``reference/`` `directory `__. +- Third-party libraries, utilities, and reference material are in the ``third-party/`` `directory `__. -``AudioData`` -~~~~~~~~~~~~~ +To install/reinstall the library locally, run ``python -m pip install -e .[dev]`` in the project `root directory `__. -Storage class for audio data. Do not instantiate. +Before a release, the version number is bumped in ``README.rst`` and ``speech_recognition/__init__.py``. Version tags are then created using ``git config gpg.program gpg2 && git config user.signingkey DB45F6C431DE7C2DCD99FF7904882258A4063489 && git tag -s VERSION_GOES_HERE -m "Version VERSION_GOES_HERE"``. -Instances of this class are returned from ``recognizer_instance.record`` and ``recognizer_instance.listen``, and are passed to callbacks of ``recognizer_instance.listen_in_background``. 
+Releases are done by running ``make-release.sh VERSION_GOES_HERE`` to build the Python source packages, sign them, and upload them to PyPI. -``audiodata_instance.get_raw_data(convert_rate = None, convert_width = None)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Testing +~~~~~~~ -Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance. +Prerequisite: `Install pipx `__. -If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. +To run all the tests: -If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. +.. code:: bash -Writing these bytes directly to a file results in a valid `RAW/PCM audio file `__. + python -m unittest discover --verbose -``audiodata_instance.get_wav_data(convert_rate = None, convert_width = None)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +To run static analysis: -Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance. +.. code:: bash -If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. + make lint -If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. +To ensure RST is well-formed: -Writing these bytes directly to a file results in a valid `WAV file `__. +.. code:: bash -``audiodata_instance.get_flac_data(convert_rate = None, convert_width = None)`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + make rstcheck -Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance. +Testing is also done automatically by GitHub Actions, upon every push. -If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. +FLAC Executables +~~~~~~~~~~~~~~~~ -If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. +The included ``flac-win32`` executable is the `official FLAC 1.3.2 32-bit Windows binary `__. -Writing these bytes directly to a file results in a valid `FLAC file `__. +The included ``flac-linux-x86`` and ``flac-linux-x86_64`` executables are built from the `FLAC 1.3.2 source code `__ with `Manylinux `__ to ensure that it's compatible with a wide variety of distributions. -Developing ----------- +The built FLAC executables should be bit-for-bit reproducible. To rebuild them, run the following inside the project directory on a Debian-like system: -To hack on this library, first make sure you have all the requirements listed in the "Requirements" section. +.. code:: bash -- Most of the library code lives in ``speech_recognition/__init__.py``. -- Examples live under the ``examples/`` directory, and the demo script lives in ``speech_recognition/__main__.py``. -- The FLAC encoder binaries are in the ``speech_recognition/`` directory. + # download and extract the FLAC source code + cd third-party + sudo apt-get install --yes docker.io -To install/reinstall the library locally, run ``python setup.py install`` in the project root directory. 
+ # build FLAC inside the Manylinux i686 Docker image + tar xf flac-1.3.2.tar.xz + sudo docker run --tty --interactive --rm --volume "$(pwd):/root" quay.io/pypa/manylinux1_i686:latest bash + cd /root/flac-1.3.2 + ./configure LDFLAGS=-static # compiler flags to make a static build + make + exit + cp flac-1.3.2/src/flac/flac ../speech_recognition/flac-linux-x86 && sudo rm -rf flac-1.3.2/ -Releases are done by running either ``build.sh`` or ``build.bat``. These are bash and batch scripts, respectively, that build Python source packages and `Python Wheels `__, then upload them to PyPI. + # build FLAC inside the Manylinux x86_64 Docker image + tar xf flac-1.3.2.tar.xz + sudo docker run --tty --interactive --rm --volume "$(pwd):/root" quay.io/pypa/manylinux1_x86_64:latest bash + cd /root/flac-1.3.2 + ./configure LDFLAGS=-static # compiler flags to make a static build + make + exit + cp flac-1.3.2/src/flac/flac ../speech_recognition/flac-linux-x86_64 && sudo rm -r flac-1.3.2/ -Features and bugfixes should be tested, at minimum, on Python 2.7 and a recent version of Python 3. It is highly recommended to test features on Python 2.6, 2.7, 3.3, and the latest version of Python 3. +The included ``flac-mac`` executable is extracted from `xACT 2.39 `__, which is a frontend for FLAC 1.3.2 that conveniently includes binaries for all of its encoders. Specifically, it is a copy of ``xACT 2.39/xACT.app/Contents/Resources/flac`` in ``xACT2.39.zip``. Authors ------- :: - Uberi (Anthony Zhang) + Uberi (Anthony Zhang) bobsayshilol arvindch (Arvind Chembarpu) kevinismith (Kevin Smith) haas85 DelightRun maverickagm + kamushadenes (Kamus Hadenes) + sbraden (Sarah Braden) + tb0hdan (Bohdan Turkynewych) + Thynix (Steve Dougherty) + beeedy (Broderick Carlin) Please report bugs and suggestions at the `issue tracker `__! +How to cite this library (APA style): + + Zhang, A. (2017). Speech Recognition (Version 3.11) [Software]. Available from https://github.com/Uberi/speech_recognition#readme. + +How to cite this library (Chicago style): + + Zhang, Anthony. 2017. *Speech Recognition* (version 3.11). + +Also check out the `Python Baidu Yuyin API `__, which is based on an older version of this project, and adds support for `Baidu Yuyin `__. Note that Baidu Yuyin is only available inside China. + License ------- -Copyright 2014-2016 `Anthony Zhang (Uberi) `__. +Copyright 2014- `Anthony Zhang (Uberi) `__. The source code for this library is available online at `GitHub `__. -The source code is available online at `GitHub `__. +SpeechRecognition is made available under the 3-clause BSD license. See ``LICENSE.txt`` in the project's `root directory `__ for more information. -This program is made available under the 3-clause BSD license. See ``LICENSE.txt`` in the project's root directory for more information. +For convenience, all the official distributions of SpeechRecognition already include a copy of the necessary copyright notices and licenses. In your project, you can simply **say that licensing information for SpeechRecognition can be found within the SpeechRecognition README, and make sure SpeechRecognition is visible to users if they wish to see it**. -This program distributes source code, binaries, and language files from `CMU Sphinx `__. These files are BSD-licensed and redistributable as long as copyright notices are correctly retained. See ``speech_recognition/pocketsphinx-data/*/LICENSE*.txt`` and ``third-party/LICENSE-Sphinx.txt`` for details concerning individual files. 
+SpeechRecognition distributes language files from `CMU Sphinx `__. These files are BSD-licensed and redistributable as long as copyright notices are correctly retained. See ``speech_recognition/pocketsphinx-data/*/LICENSE*.txt`` for license details for individual parts. -This program distributes source code and binaries from `PyAudio `__. These files are MIT-licensed and redistributable as long as copyright notices are correctly retained. See license files inside ``third-party/LICENSE-PyAudio.txt`` for details concerning individual files. +SpeechRecognition distributes binaries from `FLAC `__ - ``speech_recognition/flac-win32.exe``, ``speech_recognition/flac-linux-x86``, and ``speech_recognition/flac-mac``. These files are GPLv2-licensed and redistributable, as long as the terms of the GPL are satisfied. The FLAC binaries are an `aggregate `__ of `separate programs `__, so these GPL restrictions do not apply to the library or your programs that use the library, only to FLAC itself. See ``LICENSE-FLAC.txt`` for license details. diff --git a/build.bat b/build.bat deleted file mode 100644 index 5879ec3e..00000000 --- a/build.bat +++ /dev/null @@ -1,5 +0,0 @@ -python setup.py sdist -python setup.py bdist_wheel - -echo "if the following doesn't work, make sure you have your account set up properly with `python setup.py register`" -python setup.py sdist bdist_wheel upload diff --git a/build.sh b/build.sh deleted file mode 100644 index 29a0dc02..00000000 --- a/build.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -python3 setup.py sdist -python3 setup.py bdist_wheel - -echo "if the following doesn't work, make sure you have your account set up properly with `python3 setup.py register`" -python3 setup.py sdist bdist_wheel upload diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py new file mode 100644 index 00000000..ec14614d --- /dev/null +++ b/examples/audio_transcribe.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +import speech_recognition as sr + +# obtain path to "english.wav" in the same folder as this script +from os import path +AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "english.wav") +# AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "french.aiff") +# AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "chinese.flac") + +# use the audio file as the audio source +r = sr.Recognizer() +with sr.AudioFile(AUDIO_FILE) as source: + audio = r.record(source) # read the entire audio file + +# recognize speech using Sphinx +try: + print("Sphinx thinks you said " + r.recognize_sphinx(audio)) +except sr.UnknownValueError: + print("Sphinx could not understand audio") +except sr.RequestError as e: + print("Sphinx error; {0}".format(e)) + +# recognize speech using Google Speech Recognition +try: + # for testing purposes, we're just using the default API key + # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")` + # instead of `r.recognize_google(audio)` + print("Google Speech Recognition thinks you said " + r.recognize_google(audio)) +except sr.UnknownValueError: + print("Google Speech Recognition could not understand audio") +except sr.RequestError as e: + print("Could not request results from Google Speech Recognition service; {0}".format(e)) + +# recognize speech using Google Cloud Speech +# Before run, create local authentication credentials (``gcloud auth application-default login``) +try: + print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio)) +except 
sr.UnknownValueError: + print("Google Cloud Speech could not understand audio") +except sr.RequestError as e: + print("Could not request results from Google Cloud Speech service; {0}".format(e)) + +# recognize speech using Wit.ai +WIT_AI_KEY = "INSERT WIT.AI API KEY HERE" # Wit.ai keys are 32-character uppercase alphanumeric strings +try: + print("Wit.ai thinks you said " + r.recognize_wit(audio, key=WIT_AI_KEY)) +except sr.UnknownValueError: + print("Wit.ai could not understand audio") +except sr.RequestError as e: + print("Could not request results from Wit.ai service; {0}".format(e)) + +# recognize speech using Microsoft Azure Speech +AZURE_SPEECH_KEY = "INSERT AZURE SPEECH API KEY HERE" # Microsoft Speech API keys 32-character lowercase hexadecimal strings +try: + print("Microsoft Azure Speech thinks you said " + r.recognize_azure(audio, key=AZURE_SPEECH_KEY)) +except sr.UnknownValueError: + print("Microsoft Azure Speech could not understand audio") +except sr.RequestError as e: + print("Could not request results from Microsoft Azure Speech service; {0}".format(e)) + +# recognize speech using Microsoft Bing Voice Recognition +BING_KEY = "INSERT BING API KEY HERE" # Microsoft Bing Voice Recognition API keys 32-character lowercase hexadecimal strings +try: + print("Microsoft Bing Voice Recognition thinks you said " + r.recognize_bing(audio, key=BING_KEY)) +except sr.UnknownValueError: + print("Microsoft Bing Voice Recognition could not understand audio") +except sr.RequestError as e: + print("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e)) + +# recognize speech using Houndify +HOUNDIFY_CLIENT_ID = "INSERT HOUNDIFY CLIENT ID HERE" # Houndify client IDs are Base64-encoded strings +HOUNDIFY_CLIENT_KEY = "INSERT HOUNDIFY CLIENT KEY HERE" # Houndify client keys are Base64-encoded strings +try: + print("Houndify thinks you said " + r.recognize_houndify(audio, client_id=HOUNDIFY_CLIENT_ID, client_key=HOUNDIFY_CLIENT_KEY)) +except sr.UnknownValueError: + print("Houndify could not understand audio") +except sr.RequestError as e: + print("Could not request results from Houndify service; {0}".format(e)) + +# recognize speech using IBM Speech to Text +IBM_USERNAME = "INSERT IBM SPEECH TO TEXT USERNAME HERE" # IBM Speech to Text usernames are strings of the form XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX +IBM_PASSWORD = "INSERT IBM SPEECH TO TEXT PASSWORD HERE" # IBM Speech to Text passwords are mixed-case alphanumeric strings +try: + print("IBM Speech to Text thinks you said " + r.recognize_ibm(audio, username=IBM_USERNAME, password=IBM_PASSWORD)) +except sr.UnknownValueError: + print("IBM Speech to Text could not understand audio") +except sr.RequestError as e: + print("Could not request results from IBM Speech to Text service; {0}".format(e)) diff --git a/examples/background_listening.py b/examples/background_listening.py index 7800005f..5e96b9ba 100644 --- a/examples/background_listening.py +++ b/examples/background_listening.py @@ -2,8 +2,11 @@ # NOTE: this example requires PyAudio because it uses the Microphone class +import time + import speech_recognition as sr + # this is called from the background thread def callback(recognizer, audio): # received audio data, now we'll recognize it using Google Speech Recognition @@ -17,17 +20,21 @@ def callback(recognizer, audio): except sr.RequestError as e: print("Could not request results from Google Speech Recognition service; {0}".format(e)) + r = sr.Recognizer() m = sr.Microphone() with m as source: - 
r.adjust_for_ambient_noise(source) # we only need to calibrate once, before we start listening + r.adjust_for_ambient_noise(source) # we only need to calibrate once, before we start listening # start listening in the background (note that we don't have to do this inside a `with` statement) stop_listening = r.listen_in_background(m, callback) # `stop_listening` is now a function that, when called, stops background listening -# do some other computation for 5 seconds, then stop listening and keep doing other computations -import time -for _ in range(50): time.sleep(0.1) # we're still listening even though the main thread is doing other things -stop_listening() # calling this function requests that the background listener stop listening -while True: time.sleep(0.1) +# do some unrelated computations for 5 seconds +for _ in range(50): time.sleep(0.1) # we're still listening even though the main thread is doing other things + +# calling this function requests that the background listener stop listening +stop_listening(wait_for_stop=False) + +# do some more unrelated things +while True: time.sleep(0.1) # we're not listening anymore, even though the background thread might still be running for a second or two while cleaning up and stopping diff --git a/examples/calibrate_energy_threshold.py b/examples/calibrate_energy_threshold.py index ae1c59f3..416d8e81 100644 --- a/examples/calibrate_energy_threshold.py +++ b/examples/calibrate_energy_threshold.py @@ -7,7 +7,7 @@ # obtain audio from the microphone r = sr.Recognizer() with sr.Microphone() as source: - r.adjust_for_ambient_noise(source) # listen for 1 second to calibrate the energy threshold for ambient noise levels + r.adjust_for_ambient_noise(source) # listen for 1 second to calibrate the energy threshold for ambient noise levels print("Say something!") audio = r.listen(source) diff --git a/examples/chinese.flac b/examples/chinese.flac new file mode 100644 index 00000000..f74764fd Binary files /dev/null and b/examples/chinese.flac differ diff --git a/examples/chinese.wav b/examples/chinese.wav deleted file mode 100644 index a05a1a66..00000000 Binary files a/examples/chinese.wav and /dev/null differ diff --git a/examples/counting.gram b/examples/counting.gram new file mode 100644 index 00000000..86ff6a28 --- /dev/null +++ b/examples/counting.gram @@ -0,0 +1,11 @@ +#JSGF V1.0; + +/** + * JSGF Grammar for English counting example + */ + +grammar counting; + +public = ( ) +; + + = one | two | three | four | five | six | seven ; diff --git a/examples/english.wav b/examples/english.wav index 548973c6..40d7eb5c 100644 Binary files a/examples/english.wav and b/examples/english.wav differ diff --git a/examples/extended_results.py b/examples/extended_results.py index bf54cab6..f65061ed 100644 --- a/examples/extended_results.py +++ b/examples/extended_results.py @@ -1,15 +1,20 @@ #!/usr/bin/env python3 +from pprint import pprint + import speech_recognition as sr # obtain path to "english.wav" in the same folder as this script from os import path -WAV_FILE = path.join(path.dirname(path.realpath(__file__)), "english.wav") -# use "english.wav" as the audio source +AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "english.wav") +# AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "french.aiff") +# AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), "chinese.flac") + +# use the audio file as the audio source r = sr.Recognizer() -with sr.WavFile(WAV_FILE) as source: - audio = r.record(source) # read the entire WAV file +with 
sr.AudioFile(AUDIO_FILE) as source: + audio = r.record(source) # read the entire audio file # recognize speech using Sphinx try: @@ -24,43 +29,61 @@ # for testing purposes, we're just using the default API key # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY", show_all=True)` # instead of `r.recognize_google(audio, show_all=True)` - from pprint import pprint print("Google Speech Recognition results:") - pprint(r.recognize_google(audio, show_all=True)) # pretty-print the recognition result + pprint(r.recognize_google(audio, show_all=True)) # pretty-print the recognition result except sr.UnknownValueError: print("Google Speech Recognition could not understand audio") except sr.RequestError as e: print("Could not request results from Google Speech Recognition service; {0}".format(e)) +# recognize speech using Google Cloud Speech +# Before run, create local authentication credentials (``gcloud auth application-default login``) +try: + print("Google Cloud Speech recognition results:") + pprint(r.recognize_google_cloud(audio, show_all=True)) # pretty-print the recognition result +except sr.UnknownValueError: + print("Google Cloud Speech could not understand audio") +except sr.RequestError as e: + print("Could not request results from Google Cloud Speech service; {0}".format(e)) + # recognize speech using Wit.ai -WIT_AI_KEY = "INSERT WIT.AI API KEY HERE" # Wit.ai keys are 32-character uppercase alphanumeric strings +WIT_AI_KEY = "INSERT WIT.AI API KEY HERE" # Wit.ai keys are 32-character uppercase alphanumeric strings try: - from pprint import pprint print("Wit.ai recognition results:") - pprint(r.recognize_wit(audio, key=WIT_AI_KEY, show_all=True)) # pretty-print the recognition result + pprint(r.recognize_wit(audio, key=WIT_AI_KEY, show_all=True)) # pretty-print the recognition result except sr.UnknownValueError: print("Wit.ai could not understand audio") except sr.RequestError as e: print("Could not request results from Wit.ai service; {0}".format(e)) -# recognize speech using IBM Speech to Text -IBM_USERNAME = "INSERT IBM SPEECH TO TEXT USERNAME HERE" # IBM Speech to Text usernames are strings of the form XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX -IBM_PASSWORD = "INSERT IBM SPEECH TO TEXT PASSWORD HERE" # IBM Speech to Text passwords are mixed-case alphanumeric strings +# recognize speech using Microsoft Bing Voice Recognition +BING_KEY = "INSERT BING API KEY HERE" # Microsoft Bing Voice Recognition API keys 32-character lowercase hexadecimal strings try: - from pprint import pprint - print("IBM Speech to Text results:") - pprint(r.recognize_ibm(audio, username=IBM_USERNAME, password=IBM_PASSWORD, show_all=True)) # pretty-print the recognition result + print("Bing recognition results:") + pprint(r.recognize_bing(audio, key=BING_KEY, show_all=True)) except sr.UnknownValueError: - print("IBM Speech to Text could not understand audio") + print("Microsoft Bing Voice Recognition could not understand audio") except sr.RequestError as e: - print("Could not request results from IBM Speech to Text service; {0}".format(e)) + print("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e)) -# recognize speech using AT&T Speech to Text -ATT_APP_KEY = "INSERT AT&T SPEECH TO TEXT APP KEY HERE" # AT&T Speech to Text app keys are 32-character lowercase alphanumeric strings -ATT_APP_SECRET = "INSERT AT&T SPEECH TO TEXT APP SECRET HERE" # AT&T Speech to Text app secrets are 32-character lowercase alphanumeric strings +# recognize speech 
using Houndify +HOUNDIFY_CLIENT_ID = "INSERT HOUNDIFY CLIENT ID HERE" # Houndify client IDs are Base64-encoded strings +HOUNDIFY_CLIENT_KEY = "INSERT HOUNDIFY CLIENT KEY HERE" # Houndify client keys are Base64-encoded strings try: - print("AT&T Speech to Text thinks you said " + r.recognize_att(audio, app_key=ATT_APP_KEY, app_secret=ATT_APP_SECRET)) + print("Houndify recognition results:") + pprint(r.recognize_houndify(audio, client_id=HOUNDIFY_CLIENT_ID, client_key=HOUNDIFY_CLIENT_KEY, show_all=True)) except sr.UnknownValueError: - print("AT&T Speech to Text could not understand audio") + print("Houndify could not understand audio") except sr.RequestError as e: - print("Could not request results from AT&T Speech to Text service; {0}".format(e)) + print("Could not request results from Houndify service; {0}".format(e)) + +# recognize speech using IBM Speech to Text +IBM_USERNAME = "INSERT IBM SPEECH TO TEXT USERNAME HERE" # IBM Speech to Text usernames are strings of the form XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX +IBM_PASSWORD = "INSERT IBM SPEECH TO TEXT PASSWORD HERE" # IBM Speech to Text passwords are mixed-case alphanumeric strings +try: + print("IBM Speech to Text results:") + pprint(r.recognize_ibm(audio, username=IBM_USERNAME, password=IBM_PASSWORD, show_all=True)) # pretty-print the recognition result +except sr.UnknownValueError: + print("IBM Speech to Text could not understand audio") +except sr.RequestError as e: + print("Could not request results from IBM Speech to Text service; {0}".format(e)) diff --git a/examples/french.aiff b/examples/french.aiff new file mode 100644 index 00000000..31cd0d0f Binary files /dev/null and b/examples/french.aiff differ diff --git a/examples/french.wav b/examples/french.wav deleted file mode 100644 index c5bdd30e..00000000 Binary files a/examples/french.wav and /dev/null differ diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py index 777cf4fd..e864e2a4 100644 --- a/examples/microphone_recognition.py +++ b/examples/microphone_recognition.py @@ -2,6 +2,8 @@ # NOTE: this example requires PyAudio because it uses the Microphone class +import os + import speech_recognition as sr # obtain audio from the microphone @@ -29,8 +31,17 @@ except sr.RequestError as e: print("Could not request results from Google Speech Recognition service; {0}".format(e)) +# recognize speech using Google Cloud Speech +# Before run, create local authentication credentials (``gcloud auth application-default login``) +try: + print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio)) +except sr.UnknownValueError: + print("Google Cloud Speech could not understand audio") +except sr.RequestError as e: + print("Could not request results from Google Cloud Speech service; {0}".format(e)) + # recognize speech using Wit.ai -WIT_AI_KEY = "INSERT WIT.AI API KEY HERE" # Wit.ai keys are 32-character uppercase alphanumeric strings +WIT_AI_KEY = "INSERT WIT.AI API KEY HERE" # Wit.ai keys are 32-character uppercase alphanumeric strings try: print("Wit.ai thinks you said " + r.recognize_wit(audio, key=WIT_AI_KEY)) except sr.UnknownValueError: @@ -38,9 +49,37 @@ except sr.RequestError as e: print("Could not request results from Wit.ai service; {0}".format(e)) +# recognize speech using Microsoft Bing Voice Recognition +BING_KEY = "INSERT BING API KEY HERE" # Microsoft Bing Voice Recognition API keys 32-character lowercase hexadecimal strings +try: + print("Microsoft Bing Voice Recognition thinks you said " + r.recognize_bing(audio, 
key=BING_KEY)) +except sr.UnknownValueError: + print("Microsoft Bing Voice Recognition could not understand audio") +except sr.RequestError as e: + print("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e)) + +# recognize speech using Microsoft Azure Speech +AZURE_SPEECH_KEY = "INSERT AZURE SPEECH API KEY HERE" # Microsoft Speech API keys 32-character lowercase hexadecimal strings +try: + print("Microsoft Azure Speech thinks you said " + r.recognize_azure(audio, key=AZURE_SPEECH_KEY)) +except sr.UnknownValueError: + print("Microsoft Azure Speech could not understand audio") +except sr.RequestError as e: + print("Could not request results from Microsoft Azure Speech service; {0}".format(e)) + +# recognize speech using Houndify +HOUNDIFY_CLIENT_ID = "INSERT HOUNDIFY CLIENT ID HERE" # Houndify client IDs are Base64-encoded strings +HOUNDIFY_CLIENT_KEY = "INSERT HOUNDIFY CLIENT KEY HERE" # Houndify client keys are Base64-encoded strings +try: + print("Houndify thinks you said " + r.recognize_houndify(audio, client_id=HOUNDIFY_CLIENT_ID, client_key=HOUNDIFY_CLIENT_KEY)) +except sr.UnknownValueError: + print("Houndify could not understand audio") +except sr.RequestError as e: + print("Could not request results from Houndify service; {0}".format(e)) + # recognize speech using IBM Speech to Text -IBM_USERNAME = "INSERT IBM SPEECH TO TEXT USERNAME HERE" # IBM Speech to Text usernames are strings of the form XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX -IBM_PASSWORD = "INSERT IBM SPEECH TO TEXT PASSWORD HERE" # IBM Speech to Text passwords are mixed-case alphanumeric strings +IBM_USERNAME = "INSERT IBM SPEECH TO TEXT USERNAME HERE" # IBM Speech to Text usernames are strings of the form XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX +IBM_PASSWORD = "INSERT IBM SPEECH TO TEXT PASSWORD HERE" # IBM Speech to Text passwords are mixed-case alphanumeric strings try: print("IBM Speech to Text thinks you said " + r.recognize_ibm(audio, username=IBM_USERNAME, password=IBM_PASSWORD)) except sr.UnknownValueError: @@ -48,12 +87,18 @@ except sr.RequestError as e: print("Could not request results from IBM Speech to Text service; {0}".format(e)) -# recognize speech using AT&T Speech to Text -ATT_APP_KEY = "INSERT AT&T SPEECH TO TEXT APP KEY HERE" # AT&T Speech to Text app keys are 32-character lowercase alphanumeric strings -ATT_APP_SECRET = "INSERT AT&T SPEECH TO TEXT APP SECRET HERE" # AT&T Speech to Text app secrets are 32-character lowercase alphanumeric strings +# recognize speech using whisper try: - print("AT&T Speech to Text thinks you said " + r.recognize_att(audio, app_key=ATT_APP_KEY, app_secret=ATT_APP_SECRET)) + print("Whisper thinks you said " + r.recognize_whisper(audio, language="english")) except sr.UnknownValueError: - print("AT&T Speech to Text could not understand audio") + print("Whisper could not understand audio") +except sr.RequestError as e: + print(f"Could not request results from Whisper; {e}") + +# recognize speech using Whisper API +OPENAI_API_KEY = "INSERT OPENAI API KEY HERE" +os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY +try: + print(f"OpenAI Whisper API thinks you said {r.recognize_openai(audio)}") except sr.RequestError as e: - print("Could not request results from AT&T Speech to Text service; {0}".format(e)) + print(f"Could not request results from OpenAI Whisper API; {e}") diff --git a/examples/special_recognizer_features.py b/examples/special_recognizer_features.py new file mode 100644 index 00000000..1d051ede --- /dev/null +++ 
b/examples/special_recognizer_features.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 + +import speech_recognition as sr + +from os import path +AUDIO_FILE_EN = path.join(path.dirname(path.realpath(__file__)), "english.wav") +AUDIO_FILE_FR = path.join(path.dirname(path.realpath(__file__)), "french.aiff") + +# use the audio file as the audio source +r = sr.Recognizer() +with sr.AudioFile(AUDIO_FILE_EN) as source: + audio_en = r.record(source) # read the entire audio file +with sr.AudioFile(AUDIO_FILE_FR) as source: + audio_fr = r.record(source) # read the entire audio file + +# recognize keywords using Sphinx +try: + print("Sphinx recognition for \"one two three\" with different sets of keywords:") + print(r.recognize_sphinx(audio_en, keyword_entries=[("one", 1.0), ("two", 1.0), ("three", 1.0)])) + print(r.recognize_sphinx(audio_en, keyword_entries=[("wan", 0.95), ("too", 1.0), ("tree", 1.0)])) + print(r.recognize_sphinx(audio_en, keyword_entries=[("un", 0.95), ("to", 1.0), ("tee", 1.0)])) +except sr.UnknownValueError: + print("Sphinx could not understand audio") +except sr.RequestError as e: + print("Sphinx error; {0}".format(e)) + +# grammar example using Sphinx +try: + print("Sphinx recognition for \"one two three\" for counting grammar:") + print(r.recognize_sphinx(audio_en, grammar='counting.gram')) +except sr.UnknownValueError: + print("Sphinx could not understand audio") +except sr.RequestError as e: + print("Sphinx error; {0}".format(e)) + + +# recognize preferred phrases using Google Cloud Speech +# Before run, create local authentication credentials (``gcloud auth application-default login``) +try: + print("Google Cloud Speech recognition for \"numero\" with different sets of preferred phrases:") + print(r.recognize_google_cloud(audio_fr, preferred_phrases=["noomarow"])) + print(r.recognize_google_cloud(audio_fr, preferred_phrases=["newmarrow"])) +except sr.UnknownValueError: + print("Google Cloud Speech could not understand audio") +except sr.RequestError as e: + print("Could not request results from Google Cloud Speech service; {0}".format(e)) diff --git a/examples/tensorflow_commands.py b/examples/tensorflow_commands.py new file mode 100644 index 00000000..50306c6d --- /dev/null +++ b/examples/tensorflow_commands.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +import time +import speech_recognition as sr +from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio # noqa + +# obtain audio from the microphone +r = sr.Recognizer() +m = sr.Microphone() + +with m as source: + r.adjust_for_ambient_noise(source) + + +def callback(recognizer, audio): + try: + # You can download the data here: http://download.tensorflow.org/models/speech_commands_v0.01.zip + spoken = recognizer.recognize_tensorflow(audio, tensor_graph='speech_recognition/tensorflow-data/conv_actions_frozen.pb', tensor_label='speech_recognition/tensorflow-data/conv_actions_labels.txt') + print(spoken) + except sr.UnknownValueError: + print("Tensorflow could not understand audio") + except sr.RequestError as e: + print("Could not request results from Tensorflow service; {0}".format(e)) + + +stop_listening = r.listen_in_background(m, callback, phrase_time_limit=0.6) +time.sleep(100) diff --git a/examples/threaded_workers.py b/examples/threaded_workers.py new file mode 100644 index 00000000..3a439db0 --- /dev/null +++ b/examples/threaded_workers.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 + +# NOTE: this example requires PyAudio because it uses the Microphone class + +from threading import Thread +from queue 
import Queue + +import speech_recognition as sr + + +r = sr.Recognizer() +audio_queue = Queue() + + +def recognize_worker(): + # this runs in a background thread + while True: + audio = audio_queue.get() # retrieve the next audio processing job from the main thread + if audio is None: break # stop processing if the main thread is done + + # received audio data, now we'll recognize it using Google Speech Recognition + try: + # for testing purposes, we're just using the default API key + # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")` + # instead of `r.recognize_google(audio)` + print("Google Speech Recognition thinks you said " + r.recognize_google(audio)) + except sr.UnknownValueError: + print("Google Speech Recognition could not understand audio") + except sr.RequestError as e: + print("Could not request results from Google Speech Recognition service; {0}".format(e)) + + audio_queue.task_done() # mark the audio processing job as completed in the queue + + +# start a new thread to recognize audio, while this thread focuses on listening +recognize_thread = Thread(target=recognize_worker) +recognize_thread.daemon = True +recognize_thread.start() +with sr.Microphone() as source: + try: + while True: # repeatedly listen for phrases and put the resulting audio on the audio processing job queue + audio_queue.put(r.listen(source)) + except KeyboardInterrupt: # allow Ctrl + C to shut down the program + pass + +audio_queue.join() # block until all current audio processing jobs are done +audio_queue.put(None) # tell the recognize_thread to stop +recognize_thread.join() # wait for the recognize_thread to actually stop diff --git a/examples/wav_transcribe.py b/examples/wav_transcribe.py deleted file mode 100644 index d3ddda97..00000000 --- a/examples/wav_transcribe.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python3 - -import speech_recognition as sr - -# obtain path to "english.wav" in the same folder as this script -from os import path -WAV_FILE = path.join(path.dirname(path.realpath(__file__)), "english.wav") - -# use "english.wav" as the audio source -r = sr.Recognizer() -with sr.WavFile(WAV_FILE) as source: - audio = r.record(source) # read the entire WAV file - -# recognize speech using Sphinx -try: - print("Sphinx thinks you said " + r.recognize_sphinx(audio)) -except sr.UnknownValueError: - print("Sphinx could not understand audio") -except sr.RequestError as e: - print("Sphinx error; {0}".format(e)) - -# recognize speech using Google Speech Recognition -try: - # for testing purposes, we're just using the default API key - # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")` - # instead of `r.recognize_google(audio)` - print("Google Speech Recognition thinks you said " + r.recognize_google(audio)) -except sr.UnknownValueError: - print("Google Speech Recognition could not understand audio") -except sr.RequestError as e: - print("Could not request results from Google Speech Recognition service; {0}".format(e)) - -# recognize speech using Wit.ai -WIT_AI_KEY = "INSERT WIT.AI API KEY HERE" # Wit.ai keys are 32-character uppercase alphanumeric strings -try: - print("Wit.ai thinks you said " + r.recognize_wit(audio, key=WIT_AI_KEY)) -except sr.UnknownValueError: - print("Wit.ai could not understand audio") -except sr.RequestError as e: - print("Could not request results from Wit.ai service; {0}".format(e)) - -# recognize speech using IBM Speech to Text -IBM_USERNAME = "INSERT IBM SPEECH TO TEXT 
USERNAME HERE" # IBM Speech to Text usernames are strings of the form XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX -IBM_PASSWORD = "INSERT IBM SPEECH TO TEXT PASSWORD HERE" # IBM Speech to Text passwords are mixed-case alphanumeric strings -try: - print("IBM Speech to Text thinks you said " + r.recognize_ibm(audio, username=IBM_USERNAME, password=IBM_PASSWORD)) -except sr.UnknownValueError: - print("IBM Speech to Text could not understand audio") -except sr.RequestError as e: - print("Could not request results from IBM Speech to Text service; {0}".format(e)) - -# recognize speech using AT&T Speech to Text -ATT_APP_KEY = "INSERT AT&T SPEECH TO TEXT APP KEY HERE" # AT&T Speech to Text app keys are 32-character lowercase alphanumeric strings -ATT_APP_SECRET = "INSERT AT&T SPEECH TO TEXT APP SECRET HERE" # AT&T Speech to Text app secrets are 32-character lowercase alphanumeric strings -try: - print("AT&T Speech to Text thinks you said " + r.recognize_att(audio, app_key=ATT_APP_KEY, app_secret=ATT_APP_SECRET)) -except sr.UnknownValueError: - print("AT&T Speech to Text could not understand audio") -except sr.RequestError as e: - print("Could not request results from AT&T Speech to Text service; {0}".format(e)) diff --git a/examples/write_audio.py b/examples/write_audio.py index 55982f7a..0caf946a 100644 --- a/examples/write_audio.py +++ b/examples/write_audio.py @@ -10,6 +10,18 @@ print("Say something!") audio = r.listen(source) +# write audio to a RAW file +with open("microphone-results.raw", "wb") as f: + f.write(audio.get_raw_data()) + # write audio to a WAV file with open("microphone-results.wav", "wb") as f: f.write(audio.get_wav_data()) + +# write audio to an AIFF file +with open("microphone-results.aiff", "wb") as f: + f.write(audio.get_aiff_data()) + +# write audio to a FLAC file +with open("microphone-results.flac", "wb") as f: + f.write(audio.get_flac_data()) diff --git a/make-release.sh b/make-release.sh new file mode 100644 index 00000000..f737fb20 --- /dev/null +++ b/make-release.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +# set up bash to handle errors more aggressively - a "strict mode" of sorts +set -e # give an error if any command finishes with a non-zero exit code +set -u # give an error if we reference unset variables +set -o pipefail # for a pipeline, if any of the commands fail with a non-zero exit code, fail the entire pipeline with that exit code + +echo "Making release for SpeechRecognition-$1" + +python setup.py bdist_wheel +gpg --detach-sign -a dist/SpeechRecognition-$1-*.whl +twine upload dist/SpeechRecognition-$1-*.whl dist/SpeechRecognition-$1-*.whl.asc diff --git a/reference/library-reference.rst b/reference/library-reference.rst new file mode 100644 index 00000000..0c0d69bc --- /dev/null +++ b/reference/library-reference.rst @@ -0,0 +1,376 @@ +Speech Recognition Library Reference +==================================== + +``Microphone(device_index: Union[int, None] = None, sample_rate: int = 16000, chunk_size: int = 1024) -> Microphone`` +--------------------------------------------------------------------------------------------------------------------- + +Creates a new ``Microphone`` instance, which represents a physical microphone on the computer. Subclass of ``AudioSource``. + +This will throw an ``AttributeError`` if you don't have PyAudio (0.2.11 or later) installed. + +If ``device_index`` is unspecified or ``None``, the default microphone is used as the audio source. Otherwise, ``device_index`` should be the index of the device to use for audio input. 
+ +A device index is an integer between 0 and ``pyaudio.get_device_count() - 1`` (assume we have used ``import pyaudio`` beforehand) inclusive. It represents an audio device such as a microphone or speaker. See the `PyAudio documentation `__ for more details. + +The microphone audio is recorded in chunks of ``chunk_size`` samples, at a rate of ``sample_rate`` samples per second (Hertz). + +Higher ``sample_rate`` values result in better audio quality, but also more bandwidth (and therefore, slower recognition). Additionally, some machines, such as some Raspberry Pi models, can't keep up if this value is too high. + +Higher ``chunk_size`` values help avoid triggering on rapidly changing ambient noise, but also makes detection less sensitive. This value, generally, should be left at its default. + +Instances of this class are context managers, and are designed to be used with ``with`` statements: + +.. code:: python + + with Microphone() as source: # open the microphone and start recording + pass # do things here - ``source`` is the Microphone instance created above + # the microphone is automatically released at this point + +``Microphone.list_microphone_names() -> List[str]`` +--------------------------------------------------- + +Returns a list of the names of all available microphones. For microphones where the name can't be retrieved, the list entry contains ``None`` instead. + +The index of each microphone's name in the returned list is the same as its device index when creating a ``Microphone`` instance - if you want to use the microphone at index 3 in the returned list, use ``Microphone(device_index=3)``. + +To create a ``Microphone`` instance by name: + +.. code:: python + + m = None + for i, microphone_name in enumerate(Microphone.list_microphone_names()): + if microphone_name == "HDA Intel HDMI: 0 (hw:0,3)": + m = Microphone(device_index=i) + +``Microphone.list_working_microphones() -> Dict[int, str]`` +----------------------------------------------------------- + +Returns a dictionary mapping device indices to microphone names, for microphones that are currently hearing sounds. When using this function, ensure that your microphone is unmuted and make some noise at it to ensure it will be detected as working. + +Each key in the returned dictionary can be passed to the ``Microphone`` constructor to use that microphone. For example, if the return value is ``{3: "HDA Intel PCH: ALC3232 Analog (hw:1,0)"}``, you can do ``Microphone(device_index=3)`` to use that microphone. + +To create a ``Microphone`` instance for the first working microphone: + +.. code:: python + + for device_index in Microphone.list_working_microphones(): + m = Microphone(device_index=device_index) + break + else: + print("No working microphones found!") + +``AudioFile(filename_or_fileobject: Union[str, io.IOBase]) -> AudioFile`` +------------------------------------------------------------------------- + +Creates a new ``AudioFile`` instance given a WAV/AIFF/FLAC audio file ``filename_or_fileobject``. Subclass of ``AudioSource``. + +If ``filename_or_fileobject`` is a string, then it is interpreted as a path to an audio file on the filesystem. Otherwise, ``filename_or_fileobject`` should be a file-like object such as ``io.BytesIO`` or similar. + +Note that functions that read from the audio (such as ``recognizer_instance.record`` or ``recognizer_instance.listen``) will move ahead in the stream. 
For example, if you execute ``recognizer_instance.record(audiofile_instance, duration=10)`` twice, the first time it will return the first 10 seconds of audio, and the second time it will return the 10 seconds of audio right after that. This is always reset when entering the context with a context manager. + +WAV files must be in PCM/LPCM format; WAVE_FORMAT_EXTENSIBLE and compressed WAV are not supported and may result in undefined behaviour. + +Both AIFF and AIFF-C (compressed AIFF) formats are supported. + +FLAC files must be in native FLAC format; OGG-FLAC is not supported and may result in undefined behaviour. + +Instances of this class are context managers, and are designed to be used with ``with`` statements: + +.. code:: python + + import speech_recognition as sr + with sr.AudioFile("SOME_AUDIO_FILE") as source: # open the audio file for reading + pass # do things here - ``source`` is the AudioFile instance created above + +``audiofile_instance.DURATION # type: float`` +----------------------------------------------- + +Represents the length of the audio stored in the audio file in seconds. This property is only available when inside a context - essentially, that means it should only be accessed inside the body of a ``with audiofile_instance ...`` statement. Outside of contexts, this property is ``None``. + +This is useful when combined with the ``offset`` parameter of ``recognizer_instance.record``, since together they make it possible to perform speech recognition in chunks. + +However, note that recognizing speech in multiple chunks is not the same as recognizing the whole thing at once. If spoken words fall on the boundaries where the audio is split into chunks, each chunk only gets part of the word, which may result in inaccurate results. + +``Recognizer() -> Recognizer`` +------------------------------ + +Creates a new ``Recognizer`` instance, which represents a collection of speech recognition settings and functionality. + +``recognizer_instance.energy_threshold = 300 # type: float`` +--------------------------------------------------------------- + +Represents the energy level threshold for sounds. Values below this threshold are considered silence, and values above this threshold are considered speech. Can be changed. + +This is adjusted automatically if dynamic thresholds are enabled (see ``recognizer_instance.dynamic_energy_threshold``). A good starting value will generally allow the automatic adjustment to reach a good value faster. + +This threshold is associated with the perceived loudness of the sound, but it is a nonlinear relationship. The actual energy threshold you will need depends on your microphone sensitivity or audio data. Typical values for a silent room are 0 to 100, and typical values for speaking are between 150 and 3500. Ambient (non-speaking) noise has a significant impact on what values will work best. + +If you're having trouble with the recognizer trying to recognize words even when you're not speaking, try tweaking this to a higher value. If you're having trouble with the recognizer not recognizing your words when you are speaking, try tweaking this to a lower value. For example, a sensitive microphone or microphones in louder rooms might have an ambient energy level of up to 4000: + +.. code:: python + + import speech_recognition as sr + r = sr.Recognizer() + r.energy_threshold = 4000 + # rest of your code goes here + +The dynamic energy threshold setting can mitigate this by increasing or decreasing this automatically to account for ambient noise.
However, this takes time to adjust, so it is still possible to get the false positive detections before the threshold settles into a good value. + +To avoid this, use ``recognizer_instance.adjust_for_ambient_noise(source, duration = 1)`` to calibrate the level to a good value. Alternatively, simply set this property to a high value initially (4000 works well), so the threshold is always above ambient noise levels: over time, it will be automatically decreased to account for ambient noise levels. + +``recognizer_instance.dynamic_energy_threshold = True # type: bool`` +--------------------------------------------------------------------- + +Represents whether the energy level threshold (see ``recognizer_instance.energy_threshold``) for sounds should be automatically adjusted based on the currently ambient noise level while listening. Can be changed. + +Recommended for situations where the ambient noise level is unpredictable, which seems to be the majority of use cases. If the ambient noise level is strictly controlled, better results might be achieved by setting this to ``False`` to turn it off. + +``recognizer_instance.dynamic_energy_adjustment_damping = 0.15 # type: float`` +------------------------------------------------------------------------------- + +If the dynamic energy threshold setting is enabled (see ``recognizer_instance.dynamic_energy_threshold``), represents approximately the fraction of the current energy threshold that is retained after one second of dynamic threshold adjustment. Can be changed (not recommended). + +Lower values allow for faster adjustment, but also make it more likely to miss certain phrases (especially those with slowly changing volume). This value should be between 0 and 1. As this value approaches 1, dynamic adjustment has less of an effect over time. When this value is 1, dynamic adjustment has no effect. + +``recognizer_instance.dynamic_energy_adjustment_ratio = 1.5 # type: float`` +---------------------------------------------------------------------------- + +If the dynamic energy threshold setting is enabled (see ``recognizer_instance.dynamic_energy_threshold``), represents the minimum factor by which speech is louder than ambient noise. Can be changed (not recommended). + +For example, the default value of 1.5 means that speech is at least 1.5 times louder than ambient noise. Smaller values result in more false positives (but fewer false negatives) when ambient noise is loud compared to speech. + +``recognizer_instance.pause_threshold = 0.8 # type: float`` +------------------------------------------------------------ + +Represents the minimum length of silence (in seconds) that will register as the end of a phrase. Can be changed. + +Smaller values result in the recognition completing more quickly, but might result in slower speakers being cut off. + +``recognizer_instance.operation_timeout = None # type: Union[float, None]`` +---------------------------------------------------------------------------- + +Represents the timeout (in seconds) for internal operations, such as API requests. Can be changed. + +Setting this to a reasonable value ensures that these operations will never block indefinitely, though good values depend on your network speed and the expected length of the audio to recognize. 
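
Putting the settings above together, here is a minimal sketch of configuring a ``Recognizer`` before listening; the specific values are purely illustrative and should be tuned for your own microphone and environment:

.. code:: python

    import speech_recognition as sr

    r = sr.Recognizer()
    r.energy_threshold = 4000          # start high for a loud room; adjusted downwards automatically
    r.dynamic_energy_threshold = True  # keep adapting to ambient noise while listening
    r.pause_threshold = 0.8            # seconds of silence that mark the end of a phrase
    r.operation_timeout = 10           # give up on API requests after 10 seconds

    with sr.Microphone() as source:
        r.adjust_for_ambient_noise(source)  # calibrate energy_threshold from ambient noise
        audio = r.listen(source)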
+ +``recognizer_instance.record(source: AudioSource, duration: Union[float, None] = None, offset: Union[float, None] = None) -> AudioData`` +-------------------------------------------------------------------------------------------------------------------------------------------- + +Records up to ``duration`` seconds of audio from ``source`` (an ``AudioSource`` instance) starting at ``offset`` (or at the beginning if not specified) into an ``AudioData`` instance, which it returns. + +If ``duration`` is not specified, then it will record until there is no more audio input. + +``recognizer_instance.adjust_for_ambient_noise(source: AudioSource, duration: float = 1) -> None`` +---------------------------------------------------------------------------------------------------- + +Adjusts the energy threshold dynamically using audio from ``source`` (an ``AudioSource`` instance) to account for ambient noise. + +Intended to calibrate the energy threshold with the ambient energy level. Should be used on periods of audio without speech - will stop early if any speech is detected. + +The ``duration`` parameter is the maximum number of seconds that it will dynamically adjust the threshold for before returning. This value should be at least 0.5 in order to get a representative sample of the ambient noise. + +``recognizer_instance.listen(source: AudioSource, timeout: Union[float, None] = None, phrase_time_limit: Union[float, None] = None, snowboy_configuration: Union[Tuple[str, Iterable[str]], None] = None) -> AudioData`` +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns. + +This is done by waiting until the audio has an energy above ``recognizer_instance.energy_threshold`` (the user has started speaking), and then recording until it encounters ``recognizer_instance.pause_threshold`` seconds of non-speaking or there is no more audio input. The ending silence is not included. + +The ``timeout`` parameter is the maximum number of seconds that this will wait for a phrase to start before giving up and throwing a ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, there will be no wait timeout. + +The ``phrase_time_limit`` parameter is the maximum number of seconds that this will allow a phrase to continue before stopping and returning the part of the phrase processed before the time limit was reached. The resulting audio will be the phrase cut off at the time limit. If ``phrase_time_limit`` is ``None``, there will be no phrase time limit. + +The ``snowboy_configuration`` parameter allows integration with `Snowboy `__, an offline, high-accuracy, power-efficient hotword recognition engine. When used, this function will pause until Snowboy detects a hotword, after which it will unpause. This parameter should either be ``None`` to turn off Snowboy support, or a tuple of the form ``(SNOWBOY_LOCATION, LIST_OF_HOT_WORD_FILES)``, where ``SNOWBOY_LOCATION`` is the path to the Snowboy root directory, and ``LIST_OF_HOT_WORD_FILES`` is a list of paths to Snowboy hotword configuration files (`*.pmdl` or `*.umdl` format).
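
As a quick illustration of the ``timeout`` and ``phrase_time_limit`` behaviour described above, here is a minimal sketch (the limits are arbitrary) that waits at most 5 seconds for a phrase to start and caps each phrase at 10 seconds:

.. code:: python

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.Microphone() as source:
        try:
            # wait up to 5 seconds for speech to start, then record at most 10 seconds of it
            audio = r.listen(source, timeout=5, phrase_time_limit=10)
        except sr.WaitTimeoutError:
            print("No phrase started within 5 seconds")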
+ +This operation will always complete within ``timeout + phrase_time_limit`` seconds if both are numbers, either by returning the audio data, or by raising a ``speech_recognition.WaitTimeoutError`` exception. + +``recognizer_instance.listen_in_background(source: AudioSource, callback: Callable[[Recognizer, AudioData], Any], phrase_time_limit: Union[float, None] = None) -> Callable[bool, None]`` +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +Spawns a thread to repeatedly record phrases from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance and calls ``callback`` with that ``AudioData`` instance as soon as each phrase is detected. + +Returns a function object that, when called, requests that the background listener thread stop. The background thread is a daemon and will not stop the program from exiting if there are no other non-daemon threads. The function accepts one parameter, ``wait_for_stop``: if truthy, the function will wait for the background listener to stop before returning, otherwise it will return immediately and the background listener thread might still be running for a second or two afterwards. Additionally, if you are using a truthy value for ``wait_for_stop``, you must call the function from the same thread you originally called ``listen_in_background`` from. + +Phrase recognition uses the exact same mechanism as ``recognizer_instance.listen(source)``. The ``phrase_time_limit`` parameter works in the same way as the ``phrase_time_limit`` parameter for ``recognizer_instance.listen(source)``, as well. + +The ``callback`` parameter is a function that should accept two parameters - the ``recognizer_instance``, and an ``AudioData`` instance representing the captured audio. Note that the ``callback`` function will be called from a non-main thread. + +``recognizer_instance.recognize_sphinx(audio_data: AudioData, language: str = "en-US", keyword_entries: Union[Iterable[Tuple[str, float]], None] = None, grammar: Union[str, None] = None, show_all: bool = False) -> Union[str, pocketsphinx.pocketsphinx.Decoder]`` +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +.. autofunction:: speech_recognition.recognizers.pocketsphinx.recognize + +``recognizer_instance.recognize_google(audio_data: AudioData, key: Union[str, None] = None, language: str = "en-US", pfilter: Union[0, 1], show_all: bool = False) -> Union[str, Dict[str, Any]]`` +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API. + +The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**. + +To obtain your own API key, simply follow the steps on the `API Keys `__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API". Note that **the API quota for your own keys is 50 requests per day**, and there is currently no way to raise this limit.
+ +The recognition language is determined by ``language``, an IETF language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. A list of supported language tags can be found `here `__. Basically, language codes can be just the language (``en``), or a language with a dialect (``en-US``). + +The profanity filter level can be adjusted with ``pfilter``: 0 - No filter, 1 - Only shows the first character and replaces the rest with asterisks. The default is level 0. + +Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary. + +Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. + +``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json_path: Union[str, None] = None, **kwargs) -> Union[str, Dict[str, Any]]`` +------------------------------------------------------------------------------------------------------------------------------------------------------------- + +.. autofunction:: speech_recognition.recognizers.google_cloud.recognize + +``recognizer_instance.recognize_wit(audio_data: AudioData, key: str, show_all: bool = False) -> Union[str, Dict[str, Any]]`` +---------------------------------------------------------------------------------------------------------------------------- + +Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Wit.ai API. + +The Wit.ai API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account `__ and creating an app. You will need to add at least one intent to the app before you can see the API key, though the actual intent settings don't matter. + +To get the API key for a Wit.ai app, go to the app's overview page, go to the section titled "Make an API request", and look for something along the lines of ``Authorization: Bearer XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX``; ``XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX`` is the API key. Wit.ai API keys are 32-character uppercase alphanumeric strings. + +The recognition language is configured in the Wit.ai app settings. + +Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. + +Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. + +``recognizer_instance.recognize_bing(audio_data: AudioData, key: str, language: str = "en-US", show_all: bool = False) -> Union[str, Dict[str, Any]]`` +------------------------------------------------------------------------------------------------------------------------------------------------------ + +Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Speech API. + +The Microsoft Bing Speech API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account `__ with Microsoft Azure. + +To get the API key, go to the `Microsoft Azure Portal Resources `__ page, go to "All Resources" > "Add" > "See All" > Search "Bing Speech API > "Create", and fill in the form to make a "Bing Speech API" resource. 
On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the `key` parameter. Microsoft Bing Speech API keys are 32-character lowercase hexadecimal strings. + +The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation `__ under "Interactive and dictation mode". + +Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. + +Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. + +``recognizer_instance.recognize_houndify(audio_data: AudioData, client_id: str, client_key: str, show_all: bool = False) -> Union[str, Dict[str, Any]]`` +-------------------------------------------------------------------------------------------------------------------------------------------------------- + +Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Houndify API. + +The Houndify client ID and client key are specified by ``client_id`` and ``client_key``, respectively. Unfortunately, these are not available without `signing up for an account `__. Once logged into the `dashboard `__, you will want to select "Register a new client", and fill in the form as necessary. When at the "Enable Domains" page, enable the "Speech To Text Only" domain, and then select "Save & Continue". + +To get the client ID and client key for a Houndify client, go to the `dashboard `__ and select the client's "View Details" link. On the resulting page, the client ID and client key will be visible. Client IDs and client keys are both Base64-encoded strings. + +Currently, only English is supported as a recognition language. + +Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary. + +Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. + +``recognizer_instance.recognize_ibm(audio_data: AudioData, username: str, password: str, language: str = "en-US", show_all: bool = False) -> Union[str, Dict[str, Any]]`` +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the IBM Speech to Text API. + +The IBM Speech to Text username and password are specified by ``username`` and ``password``, respectively. Unfortunately, these are not available without `signing up for an account `__. Once logged into the Bluemix console, follow the instructions for `creating an IBM Watson service instance `__, where the Watson service is "Speech To Text". IBM Speech to Text usernames are strings of the form XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX, while passwords are mixed-case alphanumeric strings. 
+ +The recognition language is determined by ``language``, an RFC5646 language tag with a dialect like ``"en-US"`` (US English) or ``"zh-CN"`` (Mandarin Chinese), defaulting to US English. The supported language values are listed under the ``model`` parameter of the `audio recognition API documentation `__, in the form ``LANGUAGE_BroadbandModel``, where ``LANGUAGE`` is the language value. + +Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. + +Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. + +``recognizer_instance.recognize_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, load_options=None, **transcribe_options)`` +--------------------------------------------------------------------------------------------------------------------------------------------------- + +.. autofunction:: speech_recognition.recognizers.whisper_local.whisper.recognize + +``recognizer_instance.recognize_faster_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, **transcribe_options)`` +--------------------------------------------------------------------------------------------------------------------------------------- + +.. autofunction:: speech_recognition.recognizers.whisper_local.faster_whisper.recognize + +``recognizer_instance.recognize_openai(audio_data: AudioData, model = "whisper-1", **kwargs)`` +---------------------------------------------------------------------------------------------- + +.. autofunction:: speech_recognition.recognizers.whisper_api.openai.recognize + +``recognizer_instance.recognize_groq(audio_data: AudioData, model = "whisper-large-v3-turbo", **kwargs)`` +--------------------------------------------------------------------------------------------------------- + +.. autofunction:: speech_recognition.recognizers.whisper_api.groq.recognize + +``AudioSource`` +--------------- + +Base class representing audio sources. Do not instantiate. + +Instances of subclasses of this class, such as ``Microphone`` and ``AudioFile``, can be passed to things like ``recognizer_instance.record`` and ``recognizer_instance.listen``. Those instances act like context managers, and are designed to be used with ``with`` statements. + +For more information, see the documentation for the individual subclasses. + +``AudioData(frame_data: bytes, sample_rate: int, sample_width: int) -> AudioData`` +---------------------------------------------------------------------------------- + +Creates a new ``AudioData`` instance, which represents mono audio data. + +The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format. + +The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample. + +The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz). + +Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly. 
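
For illustration, here is a minimal sketch that builds an ``AudioData`` instance directly from raw 16-bit mono PCM bytes (one second of silence at 16 kHz, used purely as placeholder data) and writes it out as a WAV file using ``get_wav_data`` (described below):

.. code:: python

    import speech_recognition as sr

    raw = b"\x00\x00" * 16000  # 1 second of 16-bit mono silence at 16 kHz (2 bytes per sample)
    audio = sr.AudioData(raw, sample_rate=16000, sample_width=2)

    with open("silence.wav", "wb") as f:
        f.write(audio.get_wav_data())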
+ +``audiodata_instance.get_segment(start_ms: Union[float, None] = None, end_ms: Union[float, None] = None) -> AudioData`` +----------------------------------------------------------------------------------------------------------------------- + +Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in. + +If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end. + +``audiodata_instance.get_raw_data(convert_rate: Union[int, None] = None, convert_width: Union[int, None] = None) -> bytes`` +--------------------------------------------------------------------------------------------------------------------------- + +Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance. + +If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. + +If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. + +Writing these bytes directly to a file results in a valid `RAW/PCM audio file `__. + +``audiodata_instance.get_wav_data(convert_rate: Union[int, None] = None, convert_width: Union[int, None] = None) -> bytes`` +--------------------------------------------------------------------------------------------------------------------------- + +Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance. + +If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. + +If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. + +Writing these bytes directly to a file results in a valid `WAV file `__. + +``audiodata_instance.get_aiff_data(convert_rate: Union[int, None] = None, convert_width: Union[int, None] = None) -> bytes`` +---------------------------------------------------------------------------------------------------------------------------- + +Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance. + +If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. + +If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. + +Writing these bytes directly to a file results in a valid `AIFF-C file `__. + +``audiodata_instance.get_flac_data(convert_rate: Union[int, None] = None, convert_width: Union[int, None] = None) -> bytes`` +---------------------------------------------------------------------------------------------------------------------------- + +Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance. + +Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC. + +If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. 
+ +If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. + +Writing these bytes directly to a file results in a valid `FLAC file `__. diff --git a/reference/pocketsphinx.rst b/reference/pocketsphinx.rst new file mode 100644 index 00000000..c6568c53 --- /dev/null +++ b/reference/pocketsphinx.rst @@ -0,0 +1,113 @@ +Notes on using PocketSphinx +=========================== + +Installing other languages +-------------------------- + +By default, SpeechRecognition's Sphinx functionality supports only US English. Additional language packs are also available, but not included due to the files being too large: + +* `International French `__ +* `Mandarin Chinese `__ +* `Italian `__ + +To install a language pack, download the ZIP archives and extract them directly into the module install directory (you can find the module install directory by running ``python -c "import speech_recognition as sr, os.path as p; print(p.dirname(sr.__file__))"``). + +Here is a simple Bash script to install all of them, assuming you've downloaded all three ZIP files into your current directory: + +.. code:: bash + + #!/usr/bin/env bash + SR_LIB=$(python -c "import speech_recognition as sr, os.path as p; print(p.dirname(sr.__file__))") + sudo apt-get install --yes unzip + sudo unzip -o fr-FR.zip -d "$SR_LIB" + sudo chmod --recursive a+r "$SR_LIB/pocketsphinx-data/fr-FR/" + sudo unzip -o zh-CN.zip -d "$SR_LIB" + sudo chmod --recursive a+r "$SR_LIB/pocketsphinx-data/zh-CN/" + sudo unzip -o it-IT.zip -d "$SR_LIB" + sudo chmod --recursive a+r "$SR_LIB/pocketsphinx-data/it-IT/" + +Once installed, you can simply specify the language using the ``language`` parameter of ``recognizer_instance.recognize_sphinx``. For example, French would be specified with ``"fr-FR"`` and Mandarin with ``"zh-CN"``. + +Building PocketSphinx-Python from source +---------------------------------------- + +For Linux and other POSIX systems (like OS X), you'll want to build from source. It should take less than two minutes on a fast machine. + +* On any Debian-derived Linux distributions (like Ubuntu and Mint): + 1. Run ``sudo apt-get install python3 python3-all-dev python3-pip build-essential swig git libpulse-dev libasound2-dev`` for Python 3. + 2. Run ``pip3 install pocketsphinx`` for Python 3. +* On OS X: + 1. Run ``brew install swig git python3`` for Python 3. + 2. Install PocketSphinx-Python using Pip: ``pip install pocketsphinx``. + * If this gives errors when importing the library in your program, try running ``brew link --overwrite python``. +* On other POSIX-based systems: + 1. Install `Python `__, `Pip `__, `SWIG `__, and `Git `__, preferably using a package manager. + 2. Install PocketSphinx-Python using Pip: ``pip install pocketsphinx``. +* On Windows: + 1. Install `Python `__, `Pip `__, `SWIG `__, and `Git `__, preferably using a package manager. + 2. Add the folders containing the Python, SWIG, and Git binaries to your ``PATH`` environment variable. + * My ``PATH`` environment variable looks something like: ``C:\Users\Anthony\Desktop\swigwin-3.0.8;C:\Program Files\Git\cmd;(A BUNCH OF OTHER PATHS)``. + 3. Reboot to apply changes. + 4. Download the full PocketSphinx-Python source code by running ``git clone --recursive --depth 1 https://github.com/cmusphinx/pocketsphinx-python`` (downloading the ZIP archive from GitHub will not work). + 5. Run ``python setup.py install`` in the PocketSphinx-Python source code folder to compile and install PocketSphinx. + 6. 
Side note: when I build the precompiled Wheel packages, I skip steps 5 and 6 and do the following instead: + * For Python 3.4: ``C:\Python34\python.exe setup.py bdist_wheel``. + * For Python 3.5: ``C:\Users\Anthony\AppData\Local\Programs\Python\Python35\python.exe setup.py bdist_wheel``. + * The resulting packages are located in the ``dist`` folder of the PocketSphinx-Python project directory. + +Notes on the structure of the language data +------------------------------------------- + +* Every language has its own folder under ``/speech_recognition/pocketsphinx-data/LANGUAGE_NAME/``, where ``LANGUAGE_NAME`` is the IETF language tag, like ``"en-US"`` (US English) or ``"en-GB"`` (UK English). + * For example, the US English data is stored in ``/speech_recognition/pocketsphinx-data/en-US/``. + * The ``language`` parameter of ``recognizer_instance.recognize_sphinx`` simply chooses the folder with the given name. +* Languages are composed of 3 parts: + * An acoustic model ``/speech_recognition/pocketsphinx-data/LANGUAGE_NAME/acoustic-model/``, which describes how to interpret audio data. + * Acoustic models can be downloaded from the `CMU Sphinx files `__. These are pretty disorganized, but instructions for cleaning up specific versions are listed below. + * All of these should be 16 kHz (broadband) models, since that's what the library will assume is being used. + * A language model ``/speech_recognition/pocketsphinx-data/LANGUAGE_NAME/language-model.lm.bin`` (in `CMU binary format `__). + * A pronounciation dictionary ``/speech_recognition/pocketsphinx-data/LANGUAGE_NAME/pronounciation-dictionary.dict``, which describes how words in the language are pronounced. + +Notes on building the language data from source +----------------------------------------------- + +* All of the following points assume a Debian-derived Linux Distibution (like Ubuntu or Mint). +* To work with any complete, real-world languages, you will need quite a bit of RAM (16 GB recommended) and a fair bit of disk space (20 GB recommended). +* `SphinxBase `__ is needed for all language model file format conversions. We use it to convert between ``*.dmp`` DMP files (an obselete Sphinx binary format), ``*.lm`` ARPA files, and Sphinx binary ``*.lm.bin`` files: + * Install all the SphinxBase build dependencies with ``sudo apt-get install build-essential automake autotools-dev autoconf libtool``. + * Download and extract the `SphinxBase source code `__. + * Follow the instructions in the README to install SphinxBase. Basically, run ``sh autogen.sh --force && ./configure && make && sudo make install`` in the SphinxBase folder. +* Pruning (getting rid of less important information) is useful if language model files are too large. We can do this using `IRSTLM `__: + * Install all the IRSTLM build dependencies with ``sudo apt-get install build-essential automake autotools-dev autoconf libtool`` + * Download and extract the `IRSTLM source code `__. + * Follow the instructions in the README to install IRSTLM. Basically, run ``sh regenerate-makefiles.sh --force && ./configure && make && sudo make install`` in the IRSTLM folder. + * If the language model is not in ARPA format, convert it to the ARPA format. To do this, ensure that SphinxBase is installed and run ``sphinx_lm_convert -i LANGUAGE_MODEL_FILE_GOES_HERE -o language-model.lm -ofmt arpa``. + * Prune the model using IRSTLM: run ``prune-lm --threshold=1e-8 t.lm pruned.lm`` to prune with a threshold of 0.00000001. The higher the threshold, the smaller the resulting file. 
+ * Convert the model back into binary format if it was originally not in ARPA format. To do this, ensure that SphinxBase is installed and run ``sphinx_lm_convert -i language-model.lm -o LANGUAGE_MODEL_FILE_GOES_HERE``. +* US English: ``/speech_recognition/pocketsphinx-data/en-US/`` is taken directly from the contents of `PocketSphinx's US English model `__. +* International French: ``/speech_recognition/pocketsphinx-data/fr-FR/``: + * ``/speech_recognition/pocketsphinx-data/fr-FR/language-model.lm.bin`` is ``fr-small.lm.bin`` from the `Sphinx French language model `__. + * ``/speech_recognition/pocketsphinx-data/fr-FR/pronounciation-dictionary.dict`` is ``fr.dict`` from the `Sphinx French language model `__. + * ``/speech_recognition/pocketsphinx-data/fr-FR/acoustic-model/`` contains all of the files extracted from ``cmusphinx-fr-5.2.tar.gz`` in the `Sphinx French acoustic model `__. + * To get better French recognition accuracy at the expense of higher disk space and RAM usage: + 1. Download ``fr.lm.gmp`` from the `Sphinx French language model `__. + 2. Convert from DMP (an obselete Sphinx binary format) to ARPA format: ``sphinx_lm_convert -i fr.lm.gmp -o french.lm.bin``. + 3. Replace ``/speech_recognition/pocketsphinx-data/fr-FR/language-model.lm.bin`` with ``french.lm.bin`` created in the previous step. +* Mandarin Chinese: ``/speech_recognition/pocketsphinx-data/zh-CN/``: + * ``/speech_recognition/pocketsphinx-data/zh-CN/language-model.lm.bin`` is generated as follows: + 1. Download ``zh_broadcastnews_64000_utf8.DMP`` from the `Sphinx Mandarin language model `__. + 2. Convert from DMP (an obselete Sphinx binary format) to ARPA format: ``sphinx_lm_convert -i zh_broadcastnews_64000_utf8.DMP -o chinese.lm -ofmt arpa``. + 3. Prune with a threshold of 0.00000004 using ``prune-lm --threshold=4e-8 chinese.lm chinese.lm``. + 4. Convert from ARPA format to Sphinx binary format: ``sphinx_lm_convert -i chinese.lm -o chinese.lm.bin``. + 5. Replace ``/speech_recognition/pocketsphinx-data/zh-CN/language-model.lm.bin`` with ``chinese.lm.bin`` created in the previous step. + * ``/speech_recognition/pocketsphinx-data/zh-CN/pronounciation-dictionary.dict`` is ``zh_broadcastnews_utf8.dic`` from the `Sphinx Mandarin language model `__. + * ``/speech_recognition/pocketsphinx-data/zh-CN/acoustic-model/`` contains all of the files extracted from ``zh_broadcastnews_16k_ptm256_8000.tar.bz2`` in the `Sphinx Mandarin acoustic model `__. + * To get better Chinese recognition accuracy at the expense of higher disk space and RAM usage, simply skip step 3 when preparing ``zh_broadcastnews_64000_utf8.DMP``. +* Italian: ``/speech_recognition/pocketsphinx-data/it-IT/``: + * ``/speech_recognition/pocketsphinx-data/it-IT/language-model.lm.bin`` is generated as follows: + 1. Download ``cmusphinx-it-5.2.tar.gz`` from the `Sphinx Italian language model `__. + 2. Extract ``/etc/voxforge_it_sphinx.lm`` from ``cmusphinx-it-5.2.tar.gz`` as ``italian.lm``. + 3. Convert from ARPA format to Sphinx binary format: ``sphinx_lm_convert -i italian.lm -o italian.lm.bin``. + 4. Replace ``/speech_recognition/pocketsphinx-data/it-IT/language-model.lm.bin`` with ``italian.lm.bin`` created in the previous step. + * ``/speech_recognition/pocketsphinx-data/it-IT/pronounciation-dictionary.dict`` is ``/etc/voxforge_it_sphinx.dic`` from ``cmusphinx-it-5.2.tar.gz`` (from the `Sphinx Italian language model `__). 
+ * ``/speech_recognition/pocketsphinx-data/it-IT/acoustic-model/`` contains all of the files in ``/model_parameters`` extracted from ``cmusphinx-it-5.2.tar.gz`` (from the `Sphinx Italian language model `__). diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..23dc597b --- /dev/null +++ b/setup.cfg @@ -0,0 +1,25 @@ +[options.extras_require] +dev = + pytest + pytest-randomly + respx + numpy +audio = + PyAudio >= 0.2.11 +pocketsphinx = + pocketsphinx +google-cloud = + google-cloud-speech +whisper-local = + openai-whisper + soundfile +faster-whisper = + faster-whisper +openai = + openai + httpx < 0.28 +groq = + groq + httpx < 0.28 +assemblyai = + requests diff --git a/setup.py b/setup.py index d3231fac..efd463ea 100644 --- a/setup.py +++ b/setup.py @@ -1,45 +1,57 @@ #!/usr/bin/env python3 -import sys, os, stat +import logging +import os +import stat -from setuptools import setup +from setuptools import find_packages, setup from setuptools.command.install import install -from distutils import log import speech_recognition -if sys.version_info < (2, 6): - print("THIS MODULE REQUIRES PYTHON 2.6, 2.7, OR 3.3+. YOU ARE CURRENTLY USING PYTHON {0}".format(sys.version)) - sys.exit(1) +logger = logging.getLogger("SpeechRecognition.setup") + +FILES_TO_MARK_EXECUTABLE = ["flac-linux-x86", "flac-linux-x86_64", "flac-mac", "flac-win32.exe"] + -FILES_TO_MARK_EXECUTABLE = ["flac-linux-i386", "flac-mac", "flac-win32.exe"] class InstallWithExtraSteps(install): def run(self): - install.run(self) # do the original install steps + install.run(self) # do the original install steps # mark the FLAC executables as executable by all users (this fixes occasional issues when file permissions get messed up) for output_path in self.get_outputs(): if os.path.basename(output_path) in FILES_TO_MARK_EXECUTABLE: - log.info("setting executable permissions on {}".format(output_path)) + logger.info("setting executable permissions on %s", output_path) stat_info = os.stat(output_path) - os.chmod(output_path, stat_info.st_mode | stat.S_IEXEC) + OWNER_CAN_READ_EXECUTE = stat.S_IRUSR | stat.S_IXUSR + GROUP_CAN_READ_EXECUTE = stat.S_IRGRP | stat.S_IXGRP + OTHERS_CAN_READ_EXECUTE = stat.S_IROTH | stat.S_IXOTH + os.chmod( + output_path, + stat_info.st_mode + | OWNER_CAN_READ_EXECUTE + | GROUP_CAN_READ_EXECUTE + | OTHERS_CAN_READ_EXECUTE, + ) + setup( - name = "SpeechRecognition", - version = speech_recognition.__version__, - packages = ["speech_recognition"], - include_package_data = True, - cmdclass = {"install": InstallWithExtraSteps}, + name="SpeechRecognition", + version=speech_recognition.__version__, + packages=find_packages(exclude=["tests.*", "test"]), + include_package_data=True, + cmdclass={"install": InstallWithExtraSteps}, # PyPI metadata - author = speech_recognition.__author__, - author_email = "azhang9@gmail.com", - description = speech_recognition.__doc__, - long_description = open("README.rst").read(), - license = speech_recognition.__license__, - keywords = "speech recognition google wit ibm att", - url = "https://github.com/Uberi/speech_recognition#readme", - classifiers = [ + author=speech_recognition.__author__, + author_email="azhang9@gmail.com", + description=speech_recognition.__doc__, + long_description=open("README.rst").read(), + long_description_content_type="text/x-rst", + license=speech_recognition.__license__, + keywords="speech recognition voice sphinx google wit bing api houndify ibm snowboy", + url="https://github.com/Uberi/speech_recognition#readme", + classifiers=[ "Development 
Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Natural Language :: English", @@ -49,14 +61,18 @@ def run(self): "Operating System :: MacOS :: MacOS X", "Operating System :: Other OS", "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.6", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Multimedia :: Sound/Audio :: Speech", ], + python_requires=">=3.9", + install_requires=[ + "typing-extensions", + "standard-aifc; python_version>='3.13'", + "audioop-lts; python_version>='3.13'", + ], ) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 77070fc9..dd4143fc 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1,26 +1,43 @@ #!/usr/bin/env python3 -"""Library for performing speech recognition with support for Google Speech Recognition, Wit.ai, IBM Speech to Text, and AT&T Speech to Text.""" +"""Library for performing speech recognition, with support for several engines and APIs, online and offline.""" -__author__ = "Anthony Zhang (Uberi)" -__version__ = "3.2.0" -__license__ = "BSD" +from __future__ import annotations -import io, os, subprocess, wave, base64 -import math, audioop, collections, threading -import platform, stat +import aifc +import audioop +import base64 +import collections +import hashlib +import hmac +import io import json +import math +import os +import subprocess +import sys +import tempfile +import threading +import time +import uuid +import wave +from urllib.error import HTTPError, URLError +from urllib.parse import urlencode +from urllib.request import Request, urlopen + +from .audio import AudioData, get_flac_converter +from .exceptions import ( + RequestError, + TranscriptionFailed, + TranscriptionNotReady, + UnknownValueError, + WaitTimeoutError, +) -try: # try to use python2 module - from urllib2 import Request, urlopen, URLError, HTTPError -except ImportError: # otherwise, use python3 module - from urllib.request import Request, urlopen - from urllib.error import URLError, HTTPError +__author__ = "Anthony Zhang (Uberi)" +__version__ = "3.14.1" +__license__ = "BSD" -# define exceptions -class WaitTimeoutError(Exception): pass -class RequestError(Exception): pass -class UnknownValueError(Exception): pass class AudioSource(object): def __init__(self): @@ -32,247 +49,302 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): raise NotImplementedError("this is an abstract class") -try: - import pyaudio - class Microphone(AudioSource): - """ - This is available if PyAudio is available, and is undefined otherwise. - Creates a new ``Microphone`` instance, which represents a physical microphone on the computer. Subclass of ``AudioSource``. +class Microphone(AudioSource): + """ + Creates a new ``Microphone`` instance, which represents a physical microphone on the computer. Subclass of ``AudioSource``. - If ``device_index`` is unspecified or ``None``, the default microphone is used as the audio source. Otherwise, ``device_index`` should be the index of the device to use for audio input. 
+ This will throw an ``AttributeError`` if you don't have PyAudio (0.2.11 or later) installed. - A device index is an integer between 0 and ``pyaudio.get_device_count() - 1`` (assume we have used ``import pyaudio`` beforehand) inclusive. It represents an audio device such as a microphone or speaker. See the `PyAudio documentation `__ for more details. + If ``device_index`` is unspecified or ``None``, the default microphone is used as the audio source. Otherwise, ``device_index`` should be the index of the device to use for audio input. - The microphone audio is recorded in chunks of ``chunk_size`` samples, at a rate of ``sample_rate`` samples per second (Hertz). + A device index is an integer between 0 and ``pyaudio.get_device_count() - 1`` (assume we have used ``import pyaudio`` beforehand) inclusive. It represents an audio device such as a microphone or speaker. See the `PyAudio documentation `__ for more details. - Higher ``sample_rate`` values result in better audio quality, but also more bandwidth (and therefore, slower recognition). Additionally, some machines, such as some Raspberry Pi models, can't keep up if this value is too high. + The microphone audio is recorded in chunks of ``chunk_size`` samples, at a rate of ``sample_rate`` samples per second (Hertz). If not specified, the value of ``sample_rate`` is determined automatically from the system's microphone settings. - Higher ``chunk_size`` values help avoid triggering on rapidly changing ambient noise, but also makes detection less sensitive. This value, generally, should be left at its default. - """ - def __init__(self, device_index = None, sample_rate = 16000, chunk_size = 1024): - assert device_index is None or isinstance(device_index, int), "Device index must be None or an integer" - if device_index is not None: # ensure device index is in range - audio = pyaudio.PyAudio(); count = audio.get_device_count(); audio.terminate() # obtain device count - assert 0 <= device_index < count, "Device index out of range" - assert isinstance(sample_rate, int) and sample_rate > 0, "Sample rate must be a positive integer" - assert isinstance(chunk_size, int) and chunk_size > 0, "Chunk size must be a positive integer" - self.device_index = device_index - self.format = pyaudio.paInt16 # 16-bit int sampling - self.SAMPLE_WIDTH = pyaudio.get_sample_size(self.format) # size of each sample - self.SAMPLE_RATE = sample_rate # sampling rate in Hertz - self.CHUNK = chunk_size # number of frames stored in each buffer + Higher ``sample_rate`` values result in better audio quality, but also more bandwidth (and therefore, slower recognition). Additionally, some CPUs, such as those in older Raspberry Pi models, can't keep up if this value is too high. - self.audio = None - self.stream = None + Higher ``chunk_size`` values help avoid triggering on rapidly changing ambient noise, but also makes detection less sensitive. This value, generally, should be left at its default. 
+ """ + def __init__(self, device_index=None, sample_rate=None, chunk_size=1024): + assert device_index is None or isinstance(device_index, int), "Device index must be None or an integer" + assert sample_rate is None or (isinstance(sample_rate, int) and sample_rate > 0), "Sample rate must be None or a positive integer" + assert isinstance(chunk_size, int) and chunk_size > 0, "Chunk size must be a positive integer" + + # set up PyAudio + self.pyaudio_module = self.get_pyaudio() + audio = self.pyaudio_module.PyAudio() + try: + count = audio.get_device_count() # obtain device count + if device_index is not None: # ensure device index is in range + assert 0 <= device_index < count, "Device index out of range ({} devices available; device index should be between 0 and {} inclusive)".format(count, count - 1) + if sample_rate is None: # automatically set the sample rate to the hardware's default sample rate if not specified + device_info = audio.get_device_info_by_index(device_index) if device_index is not None else audio.get_default_input_device_info() + assert isinstance(device_info.get("defaultSampleRate"), (float, int)) and device_info["defaultSampleRate"] > 0, "Invalid device info returned from PyAudio: {}".format(device_info) + sample_rate = int(device_info["defaultSampleRate"]) + finally: + audio.terminate() + + self.device_index = device_index + self.format = self.pyaudio_module.paInt16 # 16-bit int sampling + self.SAMPLE_WIDTH = self.pyaudio_module.get_sample_size(self.format) # size of each sample + self.SAMPLE_RATE = sample_rate # sampling rate in Hertz + self.CHUNK = chunk_size # number of frames stored in each buffer + + self.audio = None + self.stream = None + + @staticmethod + def get_pyaudio(): + """ + Imports the pyaudio module and checks its version. Throws exceptions if pyaudio can't be found or a wrong version is installed + """ + try: + import pyaudio + except ImportError: + raise AttributeError("Could not find PyAudio; check installation") + return pyaudio - @staticmethod - def list_microphone_names(): - """ - Returns a list of the names of all available microphones. For microphones where the name can't be retrieved, the list entry contains ``None`` instead. + @staticmethod + def list_microphone_names(): + """ + Returns a list of the names of all available microphones. For microphones where the name can't be retrieved, the list entry contains ``None`` instead. - The index of each microphone's name is the same as its device index when creating a ``Microphone`` instance - indices in this list can be used as values of ``device_index``. - """ - audio = pyaudio.PyAudio() + The index of each microphone's name in the returned list is the same as its device index when creating a ``Microphone`` instance - if you want to use the microphone at index 3 in the returned list, use ``Microphone(device_index=3)``. 
+ """ + audio = Microphone.get_pyaudio().PyAudio() + try: result = [] for i in range(audio.get_device_count()): device_info = audio.get_device_info_by_index(i) result.append(device_info.get("name")) + finally: audio.terminate() - return result + return result - def __enter__(self): - assert self.stream is None, "This audio source is already inside a context manager" - self.audio = pyaudio.PyAudio() - self.stream = self.audio.open( - input_device_index = self.device_index, channels = 1, - format = self.format, rate = self.SAMPLE_RATE, frames_per_buffer = self.CHUNK, - input = True, # stream is an input stream + @staticmethod + def list_working_microphones(): + """ + Returns a dictionary mapping device indices to microphone names, for microphones that are currently hearing sounds. When using this function, ensure that your microphone is unmuted and make some noise at it to ensure it will be detected as working. + + Each key in the returned dictionary can be passed to the ``Microphone`` constructor to use that microphone. For example, if the return value is ``{3: "HDA Intel PCH: ALC3232 Analog (hw:1,0)"}``, you can do ``Microphone(device_index=3)`` to use that microphone. + """ + pyaudio_module = Microphone.get_pyaudio() + audio = pyaudio_module.PyAudio() + try: + result = {} + for device_index in range(audio.get_device_count()): + device_info = audio.get_device_info_by_index(device_index) + device_name = device_info.get("name") + assert isinstance(device_info.get("defaultSampleRate"), (float, int)) and device_info["defaultSampleRate"] > 0, "Invalid device info returned from PyAudio: {}".format(device_info) + try: + # read audio + pyaudio_stream = audio.open( + input_device_index=device_index, channels=1, format=pyaudio_module.paInt16, + rate=int(device_info["defaultSampleRate"]), input=True + ) + try: + buffer = pyaudio_stream.read(1024) + if not pyaudio_stream.is_stopped(): pyaudio_stream.stop_stream() + finally: + pyaudio_stream.close() + except Exception: + continue + + # compute RMS of debiased audio + energy = -audioop.rms(buffer, 2) + energy_bytes = bytes([energy & 0xFF, (energy >> 8) & 0xFF]) + debiased_energy = audioop.rms(audioop.add(buffer, energy_bytes * (len(buffer) // 2), 2), 2) + + if debiased_energy > 30: # probably actually audio + result[device_index] = device_name + finally: + audio.terminate() + return result + + def __enter__(self): + assert self.stream is None, "This audio source is already inside a context manager" + self.audio = self.pyaudio_module.PyAudio() + try: + self.stream = Microphone.MicrophoneStream( + self.audio.open( + input_device_index=self.device_index, channels=1, format=self.format, + rate=self.SAMPLE_RATE, frames_per_buffer=self.CHUNK, input=True, + ) ) - return self + except Exception: + self.audio.terminate() + return self - def __exit__(self, exc_type, exc_value, traceback): - if not self.stream.is_stopped(): - self.stream.stop_stream() + def __exit__(self, exc_type, exc_value, traceback): + try: self.stream.close() + finally: self.stream = None self.audio.terminate() -except ImportError: - pass -class WavFile(AudioSource): + class MicrophoneStream(object): + def __init__(self, pyaudio_stream): + self.pyaudio_stream = pyaudio_stream + + def read(self, size): + return self.pyaudio_stream.read(size, exception_on_overflow=False) + + def close(self): + try: + # sometimes, if the stream isn't stopped, closing the stream throws an exception + if not self.pyaudio_stream.is_stopped(): + self.pyaudio_stream.stop_stream() + finally: + 
self.pyaudio_stream.close() + + +class AudioFile(AudioSource): """ - Creates a new ``WavFile`` instance given a WAV audio file `filename_or_fileobject`. Subclass of ``AudioSource``. + Creates a new ``AudioFile`` instance given a WAV/AIFF/FLAC audio file ``filename_or_fileobject``. Subclass of ``AudioSource``. + + If ``filename_or_fileobject`` is a string, then it is interpreted as a path to an audio file on the filesystem. Otherwise, ``filename_or_fileobject`` should be a file-like object such as ``io.BytesIO`` or similar. + + Note that functions that read from the audio (such as ``recognizer_instance.record`` or ``recognizer_instance.listen``) will move ahead in the stream. For example, if you execute ``recognizer_instance.record(audiofile_instance, duration=10)`` twice, the first time it will return the first 10 seconds of audio, and the second time it will return the 10 seconds of audio right after that. This is always reset to the beginning when entering an ``AudioFile`` context. - If ``filename_or_fileobject`` is a string, then it is interpreted as a path to a WAV audio file (mono or stereo) on the filesystem. Otherwise, ``filename_or_fileobject`` should be a file-like object such as ``io.BytesIO`` or similar. + WAV files must be in PCM/LPCM format; WAVE_FORMAT_EXTENSIBLE and compressed WAV are not supported and may result in undefined behaviour. - Note that the WAV file must be in PCM/LPCM format; WAVE_FORMAT_EXTENSIBLE and compressed WAV are not supported and may result in undefined behaviour. + Both AIFF and AIFF-C (compressed AIFF) formats are supported. + + FLAC files must be in native FLAC format; OGG-FLAC is not supported and may result in undefined behaviour. """ def __init__(self, filename_or_fileobject): - if isinstance(filename_or_fileobject, str): - self.filename = filename_or_fileobject - else: - assert filename_or_fileobject.read, "Given WAV file must be a filename string or a file-like object" - self.filename = None - self.wav_file = filename_or_fileobject + assert isinstance(filename_or_fileobject, (type(""), type(u""))) or hasattr(filename_or_fileobject, "read"), "Given audio file must be a filename string or a file-like object" + self.filename_or_fileobject = filename_or_fileobject self.stream = None self.DURATION = None + self.audio_reader = None + self.little_endian = False + self.SAMPLE_RATE = None + self.CHUNK = None + self.FRAME_COUNT = None + def __enter__(self): assert self.stream is None, "This audio source is already inside a context manager" - if self.filename is not None: self.wav_file = open(self.filename, "rb") - self.wav_reader = wave.open(self.wav_file, "rb") - assert 1 <= self.wav_reader.getnchannels() <= 2, "Audio must be mono or stereo" - self.SAMPLE_WIDTH = self.wav_reader.getsampwidth() - self.SAMPLE_RATE = self.wav_reader.getframerate() + try: + # attempt to read the file as WAV + self.audio_reader = wave.open(self.filename_or_fileobject, "rb") + self.little_endian = True # RIFF WAV is a little-endian format (most ``audioop`` operations assume that the frames are stored in little-endian form) + except (wave.Error, EOFError): + try: + # attempt to read the file as AIFF + self.audio_reader = aifc.open(self.filename_or_fileobject, "rb") + self.little_endian = False # AIFF is a big-endian format + except (aifc.Error, EOFError): + # attempt to read the file as FLAC + if hasattr(self.filename_or_fileobject, "read"): + flac_data = self.filename_or_fileobject.read() + else: + with open(self.filename_or_fileobject, "rb") as f: flac_data = f.read() + + # 
run the FLAC converter with the FLAC data to get the AIFF data + flac_converter = get_flac_converter() + if os.name == "nt": # on Windows, specify that the process is to be started without showing a console window + startup_info = subprocess.STARTUPINFO() + startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW # specify that the wShowWindow field of `startup_info` contains a value + startup_info.wShowWindow = subprocess.SW_HIDE # specify that the console window should be hidden + else: + startup_info = None # default startupinfo + process = subprocess.Popen([ + flac_converter, + "--stdout", "--totally-silent", # put the resulting AIFF file in stdout, and make sure it's not mixed with any program output + "--decode", "--force-aiff-format", # decode the FLAC file into an AIFF file + "-", # the input FLAC file contents will be given in stdin + ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, startupinfo=startup_info) + aiff_data, _ = process.communicate(flac_data) + aiff_file = io.BytesIO(aiff_data) + try: + self.audio_reader = aifc.open(aiff_file, "rb") + except (aifc.Error, EOFError): + raise ValueError("Audio file could not be read as PCM WAV, AIFF/AIFF-C, or Native FLAC; check if file is corrupted or in another format") + self.little_endian = False # AIFF is a big-endian format + assert 1 <= self.audio_reader.getnchannels() <= 2, "Audio must be mono or stereo" + self.SAMPLE_WIDTH = self.audio_reader.getsampwidth() + + # 24-bit audio needs some special handling for old Python versions (workaround for https://bugs.python.org/issue12866) + samples_24_bit_pretending_to_be_32_bit = False + if self.SAMPLE_WIDTH == 3: # 24-bit audio + try: audioop.bias(b"", self.SAMPLE_WIDTH, 0) # test whether this sample width is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do) + except audioop.error: # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less) + samples_24_bit_pretending_to_be_32_bit = True # while the ``AudioFile`` instance will outwardly appear to be 32-bit, it will actually internally be 24-bit + self.SAMPLE_WIDTH = 4 # the ``AudioFile`` instance should present itself as a 32-bit stream now, since we'll be converting into 32-bit on the fly when reading + + self.SAMPLE_RATE = self.audio_reader.getframerate() self.CHUNK = 4096 - self.FRAME_COUNT = self.wav_reader.getnframes() + self.FRAME_COUNT = self.audio_reader.getnframes() self.DURATION = self.FRAME_COUNT / float(self.SAMPLE_RATE) - self.stream = WavFile.WavStream(self.wav_reader) + self.stream = AudioFile.AudioFileStream(self.audio_reader, self.little_endian, samples_24_bit_pretending_to_be_32_bit) return self def __exit__(self, exc_type, exc_value, traceback): - if self.filename: self.wav_file.close() + if not hasattr(self.filename_or_fileobject, "read"): # only close the file if it was opened by this class in the first place (if the file was originally given as a path) + self.audio_reader.close() self.stream = None self.DURATION = None - class WavStream(object): - def __init__(self, wav_reader): - self.wav_reader = wav_reader - - def read(self, size = -1): - buffer = self.wav_reader.readframes(self.wav_reader.getnframes() if size == -1 else size) - if isinstance(buffer, str) and str is not bytes: buffer = b"" # workaround for https://bugs.python.org/issue24608, unfortunately only fixes the issue for little-endian systems - if self.wav_reader.getnchannels() != 1: # stereo audio - buffer = audioop.tomono(buffer, self.wav_reader.getsampwidth(), 1, 1) 
# convert stereo audio data to mono + class AudioFileStream(object): + def __init__(self, audio_reader, little_endian, samples_24_bit_pretending_to_be_32_bit): + self.audio_reader = audio_reader # an audio file object (e.g., a `wave.Wave_read` instance) + self.little_endian = little_endian # whether the audio data is little-endian (when working with big-endian things, we'll have to convert it to little-endian before we process it) + self.samples_24_bit_pretending_to_be_32_bit = samples_24_bit_pretending_to_be_32_bit # this is true if the audio is 24-bit audio, but 24-bit audio isn't supported, so we have to pretend that this is 32-bit audio and convert it on the fly + + def read(self, size=-1): + buffer = self.audio_reader.readframes(self.audio_reader.getnframes() if size == -1 else size) + if not isinstance(buffer, bytes): buffer = b"" # workaround for https://bugs.python.org/issue24608 + + sample_width = self.audio_reader.getsampwidth() + if not self.little_endian: # big endian format, convert to little endian on the fly + if hasattr(audioop, "byteswap"): # ``audioop.byteswap`` was only added in Python 3.4 (incidentally, that also means that we don't need to worry about 24-bit audio being unsupported, since Python 3.4+ always has that functionality) + buffer = audioop.byteswap(buffer, sample_width) + else: # manually reverse the bytes of each sample, which is slower but works well enough as a fallback + buffer = buffer[sample_width - 1::-1] + b"".join(buffer[i + sample_width:i:-1] for i in range(sample_width - 1, len(buffer), sample_width)) + + # workaround for https://bugs.python.org/issue12866 + if self.samples_24_bit_pretending_to_be_32_bit: # we need to convert samples from 24-bit to 32-bit before we can process them with ``audioop`` functions + buffer = b"".join(b"\x00" + buffer[i:i + sample_width] for i in range(0, len(buffer), sample_width)) # since we're in little endian, we prepend a zero byte to each 24-bit sample to get a 32-bit sample + sample_width = 4 # make sure we thread the buffer as 32-bit audio now, after converting it from 24-bit audio + if self.audio_reader.getnchannels() != 1: # stereo audio + buffer = audioop.tomono(buffer, sample_width, 1, 1) # convert stereo audio data to mono return buffer -class AudioData(object): - def __init__(self, frame_data, sample_rate, sample_width): - assert sample_rate > 0, "Sample rate must be a positive integer" - assert sample_width % 1 == 0 and 2 <= sample_width <= 4, "Sample width must be 2, 3, or 4" - self.frame_data = frame_data - self.sample_rate = sample_rate - self.sample_width = int(sample_width) - - def get_raw_data(self, convert_rate = None, convert_width = None): - """ - Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance. - - If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. - - If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. - - Writing these bytes directly to a file results in a valid `RAW/PCM audio file `__. 
- """ - assert convert_rate is None or convert_rate > 0, "Sample rate to convert to must be a positive integer" - assert convert_width is None or (convert_width % 1 == 0 and 2 <= convert_width <= 4), "Sample width to convert to must be 2, 3, or 4" - - raw_data = self.frame_data - - # resample audio at the desired rate if specified - if convert_rate is not None and self.sample_rate != convert_rate: - raw_data, _ = audioop.ratecv(raw_data, self.sample_width, 1, self.sample_rate, convert_rate, None) - - # convert samples to desired byte format if specified - if convert_width is not None and self.sample_width != convert_width: - raw_data = audioop.lin2lin(raw_data, self.sample_width, convert_width) - - return raw_data - - def get_wav_data(self, convert_rate = None, convert_width = None): - """ - Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance. - - If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. - - If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. - - Writing these bytes directly to a file results in a valid `WAV file `__. - """ - raw_data = self.get_raw_data(convert_rate, convert_width) - sample_rate = self.sample_rate if convert_rate is None else convert_rate - sample_width = self.sample_width if convert_width is None else convert_width - - # generate the WAV file contents - with io.BytesIO() as wav_file: - wav_writer = wave.open(wav_file, "wb") - try: # note that we can't use context manager due to Python 2 not supporting it - wav_writer.setframerate(sample_rate) - wav_writer.setsampwidth(sample_width) - wav_writer.setnchannels(1) - wav_writer.writeframes(raw_data) - finally: # make sure resources are cleaned up - wav_writer.close() - wav_data = wav_file.getvalue() - return wav_data - - def get_flac_data(self, convert_rate = None, convert_width = None): - """ - Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance. - - If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. - - If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. - - Writing these bytes directly to a file results in a valid `FLAC file `__. 
- """ - wav_data = self.get_wav_data(convert_rate, convert_width) - - # determine which converter executable to use - system = platform.system() - path = os.path.dirname(os.path.abspath(__file__)) # directory of the current module file, where all the FLAC bundled binaries are stored - flac_converter = shutil_which("flac") # check for installed version first - if flac_converter is None: # flac utility is not installed - if system == "Windows" and platform.machine() in ["i386", "x86", "x86_64", "AMD64"]: # Windows NT, use the bundled FLAC conversion utility - flac_converter = os.path.join(path, "flac-win32.exe") - elif system == "Linux" and platform.machine() in ["i386", "x86", "x86_64", "AMD64"]: - flac_converter = os.path.join(path, "flac-linux-i386") - elif system == "Darwin" and platform.machine() in ["i386", "x86", "x86_64", "AMD64"]: - flac_converter = os.path.join(path, "flac-mac") - else: - raise OSError("FLAC conversion utility not available - consider installing the FLAC command line application using `brew install flac` or your operating system's equivalent") - - # mark FLAC converter as executable - try: - stat_info = os.stat(flac_converter) - os.chmod(flac_converter, stat_info.st_mode | stat.S_IEXEC) - except OSError: pass - - # run the FLAC converter with the WAV data to get the FLAC data - process = subprocess.Popen("\"{0}\" --stdout --totally-silent --best -".format(flac_converter), stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) - flac_data, stderr = process.communicate(wav_data) - return flac_data class Recognizer(AudioSource): def __init__(self): """ Creates a new ``Recognizer`` instance, which represents a collection of speech recognition functionality. """ - self.energy_threshold = 300 # minimum audio energy to consider for recording + self.energy_threshold = 300 # minimum audio energy to consider for recording self.dynamic_energy_threshold = True self.dynamic_energy_adjustment_damping = 0.15 self.dynamic_energy_ratio = 1.5 - self.pause_threshold = 0.8 # seconds of non-speaking audio before a phrase is considered complete - self.phrase_threshold = 0.3 # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops) - self.non_speaking_duration = 0.5 # seconds of non-speaking audio to keep on both sides of the recording + self.pause_threshold = 0.8 # seconds of non-speaking audio before a phrase is considered complete + self.operation_timeout = None # seconds after an internal operation (e.g., an API request) starts before it times out, or ``None`` for no timeout + + self.phrase_threshold = 0.3 # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops) + self.non_speaking_duration = 0.5 # seconds of non-speaking audio to keep on both sides of the recording - def record(self, source, duration = None, offset = None): + def record(self, source, duration=None, offset=None): """ Records up to ``duration`` seconds of audio from ``source`` (an ``AudioSource`` instance) starting at ``offset`` (or at the beginning if not specified) into an ``AudioData`` instance, which it returns. If ``duration`` is not specified, then it will record until there is no more audio input. 
""" assert isinstance(source, AudioSource), "Source must be an audio source" - assert source.stream is not None, "Audio source must be opened before recording - see documentation for `AudioSource`" + assert source.stream is not None, "Audio source must be entered before recording, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?" frames = io.BytesIO() seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE elapsed_time = 0 offset_time = 0 offset_reached = False - while True: # loop for the total number of chunks needed + while True: # loop for the total number of chunks needed if offset and not offset_reached: offset_time += seconds_per_buffer if offset_time > offset: @@ -291,7 +363,7 @@ def record(self, source, duration = None, offset = None): frames.close() return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH) - def adjust_for_ambient_noise(self, source, duration = 1): + def adjust_for_ambient_noise(self, source, duration=1): """ Adjusts the energy threshold dynamically using audio from ``source`` (an ``AudioSource`` instance) to account for ambient noise. @@ -300,7 +372,7 @@ def adjust_for_ambient_noise(self, source, duration = 1): The ``duration`` parameter is the maximum number of seconds that it will dynamically adjust the threshold for before returning. This value should be at least 0.5 in order to get a representative sample of the ambient noise. """ assert isinstance(source, AudioSource), "Source must be an audio source" - assert source.stream is not None, "Audio source must be opened before recording - see documentation for `AudioSource`" + assert source.stream is not None, "Audio source must be entered before adjusting, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?" 
assert self.pause_threshold >= self.non_speaking_duration >= 0 seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE @@ -311,368 +383,960 @@ def adjust_for_ambient_noise(self, source, duration = 1): elapsed_time += seconds_per_buffer if elapsed_time > duration: break buffer = source.stream.read(source.CHUNK) - energy = audioop.rms(buffer, source.SAMPLE_WIDTH) # energy of the audio signal + energy = audioop.rms(buffer, source.SAMPLE_WIDTH) # energy of the audio signal - # dynamically adjust the energy threshold using assymmetric weighted average - damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer # account for different chunk sizes and rates + # dynamically adjust the energy threshold using asymmetric weighted average + damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer # account for different chunk sizes and rates target_energy = energy * self.dynamic_energy_ratio self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping) - def listen(self, source, timeout = None): + def snowboy_wait_for_hot_word(self, snowboy_location, snowboy_hot_word_files, source, timeout=None): + # load snowboy library (NOT THREAD SAFE) + sys.path.append(snowboy_location) + import snowboydetect + sys.path.pop() + + detector = snowboydetect.SnowboyDetect( + resource_filename=os.path.join(snowboy_location, "resources", "common.res").encode(), + model_str=",".join(snowboy_hot_word_files).encode() + ) + detector.SetAudioGain(1.0) + detector.SetSensitivity(",".join(["0.4"] * len(snowboy_hot_word_files)).encode()) + snowboy_sample_rate = detector.SampleRate() + + elapsed_time = 0 + seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE + resampling_state = None + + # buffers capable of holding 5 seconds of original audio + five_seconds_buffer_count = int(math.ceil(5 / seconds_per_buffer)) + # buffers capable of holding 0.5 seconds of resampled audio + half_second_buffer_count = int(math.ceil(0.5 / seconds_per_buffer)) + frames = collections.deque(maxlen=five_seconds_buffer_count) + resampled_frames = collections.deque(maxlen=half_second_buffer_count) + # snowboy check interval + check_interval = 0.05 + last_check = time.time() + while True: + elapsed_time += seconds_per_buffer + if timeout and elapsed_time > timeout: + raise WaitTimeoutError("listening timed out while waiting for hotword to be said") + + buffer = source.stream.read(source.CHUNK) + if len(buffer) == 0: break # reached end of the stream + frames.append(buffer) + + # resample audio to the required sample rate + resampled_buffer, resampling_state = audioop.ratecv(buffer, source.SAMPLE_WIDTH, 1, source.SAMPLE_RATE, snowboy_sample_rate, resampling_state) + resampled_frames.append(resampled_buffer) + if time.time() - last_check > check_interval: + # run Snowboy on the resampled audio + snowboy_result = detector.RunDetection(b"".join(resampled_frames)) + assert snowboy_result != -1, "Error initializing streams or reading audio data" + if snowboy_result > 0: break # wake word found + resampled_frames.clear() + last_check = time.time() + + return b"".join(frames), elapsed_time + + def listen(self, source, timeout=None, phrase_time_limit=None, snowboy_configuration=None, stream=False): """ Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns. + If the ``stream`` keyword argument is ``True``, the ``listen()`` method will yield ``AudioData`` instances representing chunks of audio data as they are detected. 
The first yielded ``AudioData`` instance represents the first buffer of the phrase, and the last yielded ``AudioData`` instance represents the last buffer of the phrase. If ``stream`` is ``False``, the method will return a single ``AudioData`` instance representing the entire phrase. + This is done by waiting until the audio has an energy above ``recognizer_instance.energy_threshold`` (the user has started speaking), and then recording until it encounters ``recognizer_instance.pause_threshold`` seconds of non-speaking or there is no more audio input. The ending silence is not included. - The ``timeout`` parameter is the maximum number of seconds that it will wait for a phrase to start before giving up and throwing an ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, it will wait indefinitely. + The ``timeout`` parameter is the maximum number of seconds that this will wait for a phrase to start before giving up and throwing an ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, there will be no wait timeout. + + The ``phrase_time_limit`` parameter is the maximum number of seconds that this will allow a phrase to continue before stopping and returning the part of the phrase processed before the time limit was reached. The resulting audio will be the phrase cut off at the time limit. If ``phrase_timeout`` is ``None``, there will be no phrase time limit. + + The ``snowboy_configuration`` parameter allows integration with `Snowboy `__, an offline, high-accuracy, power-efficient hotword recognition engine. When used, this function will pause until Snowboy detects a hotword, after which it will unpause. This parameter should either be ``None`` to turn off Snowboy support, or a tuple of the form ``(SNOWBOY_LOCATION, LIST_OF_HOT_WORD_FILES)``, where ``SNOWBOY_LOCATION`` is the path to the Snowboy root directory, and ``LIST_OF_HOT_WORD_FILES`` is a list of paths to Snowboy hotword configuration files (`*.pmdl` or `*.umdl` format). + + This operation will always complete within ``timeout + phrase_timeout`` seconds if both are numbers, either by returning the audio data, or by raising a ``speech_recognition.WaitTimeoutError`` exception. """ + result = self._listen(source, timeout, phrase_time_limit, snowboy_configuration, stream) + if not stream: + for a in result: + return a + return result + + def _listen(self, source, timeout=None, phrase_time_limit=None, snowboy_configuration=None, stream=False): assert isinstance(source, AudioSource), "Source must be an audio source" - assert source.stream is not None, "Audio source must be opened before recording - see documentation for `AudioSource`" + assert source.stream is not None, "Audio source must be entered before listening, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?" 
assert self.pause_threshold >= self.non_speaking_duration >= 0 + if snowboy_configuration is not None: + assert os.path.isfile(os.path.join(snowboy_configuration[0], "snowboydetect.py")), "``snowboy_configuration[0]`` must be a Snowboy root directory containing ``snowboydetect.py``" + for hot_word_file in snowboy_configuration[1]: + assert os.path.isfile(hot_word_file), "``snowboy_configuration[1]`` must be a list of Snowboy hot word configuration files" - seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE - pause_buffer_count = int(math.ceil(self.pause_threshold / seconds_per_buffer)) # number of buffers of non-speaking audio before the phrase is complete - phrase_buffer_count = int(math.ceil(self.phrase_threshold / seconds_per_buffer)) # minimum number of buffers of speaking audio before we consider the speaking audio a phrase - non_speaking_buffer_count = int(math.ceil(self.non_speaking_duration / seconds_per_buffer)) # maximum number of buffers of non-speaking audio to retain before and after + seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE + pause_buffer_count = int(math.ceil(self.pause_threshold / seconds_per_buffer)) # number of buffers of non-speaking audio during a phrase, before the phrase should be considered complete + phrase_buffer_count = int(math.ceil(self.phrase_threshold / seconds_per_buffer)) # minimum number of buffers of speaking audio before we consider the speaking audio a phrase + non_speaking_buffer_count = int(math.ceil(self.non_speaking_duration / seconds_per_buffer)) # maximum number of buffers of non-speaking audio to retain before and after a phrase # read audio input for phrases until there is a phrase that is long enough - elapsed_time = 0 # number of seconds of audio read + elapsed_time = 0 # number of seconds of audio read + buffer = b"" # an empty buffer means that the stream has ended and there is no data left to read while True: frames = collections.deque() - # store audio input until the phrase starts - while True: - elapsed_time += seconds_per_buffer - if timeout and elapsed_time > timeout: # handle timeout if specified - raise WaitTimeoutError("listening timed out") - - buffer = source.stream.read(source.CHUNK) - if len(buffer) == 0: break # reached end of the stream + if snowboy_configuration is None: + # store audio input until the phrase starts + while True: + # handle waiting too long for phrase by raising an exception + elapsed_time += seconds_per_buffer + if timeout and elapsed_time > timeout: + raise WaitTimeoutError("listening timed out while waiting for phrase to start") + + buffer = source.stream.read(source.CHUNK) + if len(buffer) == 0: break # reached end of the stream + frames.append(buffer) + if len(frames) > non_speaking_buffer_count: # ensure we only keep the needed amount of non-speaking buffers + frames.popleft() + + # detect whether speaking has started on audio input + energy = audioop.rms(buffer, source.SAMPLE_WIDTH) # energy of the audio signal + if energy > self.energy_threshold: break + + # dynamically adjust the energy threshold using asymmetric weighted average + if self.dynamic_energy_threshold: + damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer # account for different chunk sizes and rates + target_energy = energy * self.dynamic_energy_ratio + self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping) + else: + # read audio input until the hotword is said + snowboy_location, snowboy_hot_word_files = snowboy_configuration + buffer, delta_time = 
self.snowboy_wait_for_hot_word(snowboy_location, snowboy_hot_word_files, source, timeout) + elapsed_time += delta_time + if len(buffer) == 0: break # reached end of the stream frames.append(buffer) - if len(frames) > non_speaking_buffer_count: # ensure we only keep the needed amount of non-speaking buffers - frames.popleft() - - # detect whether speaking has started on audio input - energy = audioop.rms(buffer, source.SAMPLE_WIDTH) # energy of the audio signal - if energy > self.energy_threshold: break - - # dynamically adjust the energy threshold using assymmetric weighted average - if self.dynamic_energy_threshold: - damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer # account for different chunk sizes and rates - target_energy = energy * self.dynamic_energy_ratio - self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping) # read audio input until the phrase ends pause_count, phrase_count = 0, 0 + phrase_start_time = elapsed_time + + if stream: + # yield the first buffer of the phrase + yield AudioData(b"".join(frames), source.SAMPLE_RATE, source.SAMPLE_WIDTH) + frames.clear() + while True: + # handle phrase being too long by cutting off the audio elapsed_time += seconds_per_buffer + if phrase_time_limit and elapsed_time - phrase_start_time > phrase_time_limit: + break buffer = source.stream.read(source.CHUNK) - if len(buffer) == 0: break # reached end of the stream + if len(buffer) == 0: break # reached end of the stream frames.append(buffer) phrase_count += 1 # check if speaking has stopped for longer than the pause threshold on the audio input - energy = audioop.rms(buffer, source.SAMPLE_WIDTH) # energy of the audio signal + energy = audioop.rms(buffer, source.SAMPLE_WIDTH) # unit energy of the audio signal within the buffer if energy > self.energy_threshold: pause_count = 0 else: pause_count += 1 - if pause_count > pause_buffer_count: # end of the phrase + if pause_count > pause_buffer_count: # end of the phrase break + # dynamically adjust the energy threshold using asymmetric weighted average + if self.dynamic_energy_threshold: + damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer # account for different chunk sizes and rates + target_energy = energy * self.dynamic_energy_ratio + self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping) + + if stream: + # yield the current chunk of audio data wrapped in AudioData + yield AudioData(buffer, source.SAMPLE_RATE, source.SAMPLE_WIDTH) + # check how long the detected phrase is, and retry listening if the phrase is too short - phrase_count -= pause_count - if phrase_count >= phrase_buffer_count: break # phrase is long enough, stop listening + phrase_count -= pause_count # exclude the buffers for the pause before the phrase + if phrase_count >= phrase_buffer_count or len(buffer) == 0: break # phrase is long enough or we've reached the end of the stream, so stop listening - # obtain frame data - for i in range(pause_count - non_speaking_buffer_count): frames.pop() # remove extra non-speaking frames at the end - frame_data = b"".join(list(frames)) + if stream: + # yield the last buffer of the phrase. 
+ yield AudioData(buffer, source.SAMPLE_RATE, source.SAMPLE_WIDTH) + else: + # obtain frame data + for i in range( + pause_count - non_speaking_buffer_count): frames.pop() # remove extra non-speaking frames at the end - return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH) + frame_data = b"".join(frames) + # yield the entire phrase as a single AudioData instance + yield AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH) - def listen_in_background(self, source, callback): + def listen_in_background(self, source, callback, phrase_time_limit=None): """ Spawns a thread to repeatedly record phrases from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance and call ``callback`` with that ``AudioData`` instance as soon as each phrase are detected. - Returns a function object that, when called, requests that the background listener thread stop, and waits until it does before returning. The background thread is a daemon and will not stop the program from exiting if there are no other non-daemon threads. + Returns a function object that, when called, requests that the background listener thread stop. The background thread is a daemon and will not stop the program from exiting if there are no other non-daemon threads. The function accepts one parameter, ``wait_for_stop``: if truthy, the function will wait for the background listener to stop before returning, otherwise it will return immediately and the background listener thread might still be running for a second or two afterwards. Additionally, if you are using a truthy value for ``wait_for_stop``, you must call the function from the same thread you originally called ``listen_in_background`` from. - Phrase recognition uses the exact same mechanism as ``recognizer_instance.listen(source)``. + Phrase recognition uses the exact same mechanism as ``recognizer_instance.listen(source)``. The ``phrase_time_limit`` parameter works in the same way as the ``phrase_time_limit`` parameter for ``recognizer_instance.listen(source)``, as well. The ``callback`` parameter is a function that should accept two parameters - the ``recognizer_instance``, and an ``AudioData`` instance representing the captured audio. Note that ``callback`` function will be called from a non-main thread. """ assert isinstance(source, AudioSource), "Source must be an audio source" running = [True] + def threaded_listen(): with source as s: while running[0]: - try: # listen for 1 second, then check again if the stop function has been called - audio = self.listen(s, 1) - except WaitTimeoutError: # listening timed out, just try again + try: # listen for 1 second, then check again if the stop function has been called + audio = self.listen(s, 1, phrase_time_limit) + except WaitTimeoutError: # listening timed out, just try again pass else: if running[0]: callback(self, audio) - def stopper(): + + def stopper(wait_for_stop=True): running[0] = False - listener_thread.join() # block until the background thread is done, which can be up to 1 second + if wait_for_stop: + listener_thread.join() # block until the background thread is done, which can take around 1 second + listener_thread = threading.Thread(target=threaded_listen) listener_thread.daemon = True listener_thread.start() return stopper - def recognize_sphinx(self, audio_data, language = "en-US", show_all = False): + def recognize_wit(self, audio_data, key, show_all=False): """ - Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx. 
+ Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Wit.ai API. - The recognition language is determined by ``language``, an IETF language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See the "Installing other languages" section in the README for information about additional language packs. + The Wit.ai API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account `__ and creating an app. You will need to add at least one intent to the app before you can see the API key, though the actual intent settings don't matter. - Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition. + To get the API key for a Wit.ai app, go to the app's overview page, go to the section titled "Make an API request", and look for something along the lines of ``Authorization: Bearer XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX``; ``XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX`` is the API key. Wit.ai API keys are 32-character uppercase alphanumeric strings. + + The recognition language is configured in the Wit.ai app settings. - Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation. + Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. + + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. 
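A usage sketch (``WIT_AI_KEY`` below is a placeholder for your own key)::

    import speech_recognition as sr

    WIT_AI_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"  # placeholder; use your Wit.ai app's API key

    r = sr.Recognizer()
    with sr.Microphone() as source:
        audio = r.listen(source)
    try:
        print(r.recognize_wit(audio, key=WIT_AI_KEY))
    except sr.UnknownValueError:
        print("Wit.ai could not understand the audio")
    except sr.RequestError as e:
        print("Wit.ai request error: {}".format(e))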
""" - assert isinstance(audio_data, AudioData), "`audio_data` must be audio data" - assert isinstance(language, str), "`language` must be a string" - - # import the PocketSphinx speech recognition module + assert isinstance(audio_data, AudioData), "Data must be audio data" + assert isinstance(key, str), "``key`` must be a string" + + wav_data = audio_data.get_wav_data( + convert_rate=None if audio_data.sample_rate >= 8000 else 8000, # audio samples must be at least 8 kHz + convert_width=2 # audio samples should be 16-bit + ) + url = "https://api.wit.ai/speech?v=20170307" + request = Request(url, data=wav_data, headers={"Authorization": "Bearer {}".format(key), "Content-Type": "audio/wav"}) try: - from pocketsphinx import pocketsphinx - from sphinxbase import sphinxbase - except ImportError: - raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.") - - language_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pocketsphinx-data", language) - if not os.path.isdir(language_directory): - raise RequestError("missing PocketSphinx language data directory: \"{0}\"".format(language_directory)) - acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model") - if not os.path.isdir(acoustic_parameters_directory): - raise RequestError("missing PocketSphinx language model parameters directory: \"{0}\"".format(acoustic_parameters_directory)) - language_model_file = os.path.join(language_directory, "language-model.lm.bin") - if not os.path.isfile(language_model_file): - raise RequestError("missing PocketSphinx language model file: \"{0}\"".format(language_model_file)) - phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict") - if not os.path.isfile(phoneme_dictionary_file): - raise RequestError("missing PocketSphinx phoneme dictionary file: \"{0}\"".format(phoneme_dictionary_file)) - - # create decoder object - config = pocketsphinx.Decoder.default_config() - config.set_string("-hmm", acoustic_parameters_directory) # set the path of the hidden Markov model (HMM) parameter files - config.set_string("-lm", language_model_file) - config.set_string("-dict", phoneme_dictionary_file) - config.set_string("-logfn", os.devnull) # disable logging (logging causes unwanted output in terminal) - decoder = pocketsphinx.Decoder(config) - - # obtain audio data - raw_data = audio_data.get_raw_data(convert_rate = 16000, convert_width = 2) # the included language models require audio to be 16-bit mono 16 kHz in little-endian format - - # obtain recognition results - decoder.start_utt() # begin utterance processing - decoder.process_raw(raw_data, False, True) # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True) - decoder.end_utt() # stop utterance processing - - if show_all: return decoder + response = urlopen(request, timeout=self.operation_timeout) + except HTTPError as e: + raise RequestError("recognition request failed: {}".format(e.reason)) + except URLError as e: + raise RequestError("recognition connection failed: {}".format(e.reason)) + response_text = response.read().decode("utf-8") + result = json.loads(response_text) # return results - hypothesis = decoder.hyp() - if hypothesis is not None: return hypothesis.hypstr - raise UnknownValueError() # no transcriptions available + if show_all: return result + if "_text" not in result or result["_text"] is None: raise UnknownValueError() + return result["_text"] - def recognize_google(self, audio_data, 
key = None, language = "en-US", show_all = False): + def recognize_azure(self, audio_data, key, language="en-US", profanity="masked", location="westus", show_all=False): """ - Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API. + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Azure Speech API. - The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**. + The Microsoft Azure Speech API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account `__ with Microsoft Azure. - To obtain your own API key, simply following the steps on the `API Keys `__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API". + To get the API key, go to the `Microsoft Azure Portal Resources `__ page, go to "All Resources" > "Add" > "See All" > Search "Speech > "Create", and fill in the form to make a "Speech" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the `key` parameter. Microsoft Azure Speech API keys are 32-character lowercase hexadecimal strings. - The recognition language is determined by ``language``, an IETF language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. A list of supported language codes can be found `here `__. Basically, language codes can be just the language (``en``), or a language with a dialect (``en-US``). + The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation `__ under "Interactive and dictation mode". - Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary. + Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. - Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the key isn't valid, the quota for the key is maxed out, or there is no internet connection. + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. 
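A usage sketch (``AZURE_SPEECH_KEY`` below is a placeholder; ``location`` should match the region the Speech resource was created in)::

    import speech_recognition as sr

    AZURE_SPEECH_KEY = "00000000000000000000000000000000"  # placeholder; use your own key

    r = sr.Recognizer()
    with sr.Microphone() as source:
        audio = r.listen(source)
    # returns a (transcription, confidence) pair when show_all is false
    text, confidence = r.recognize_azure(audio, key=AZURE_SPEECH_KEY, location="westus")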
""" - assert isinstance(audio_data, AudioData), "`audio_data` must be audio data" - assert key is None or isinstance(key, str), "`key` must be `None` or a string" - assert isinstance(language, str), "`language` must be a string" + assert isinstance(audio_data, AudioData), "Data must be audio data" + assert isinstance(key, str), "``key`` must be a string" + # assert isinstance(result_format, str), "``format`` must be a string" # simple|detailed + assert isinstance(language, str), "``language`` must be a string" - flac_data = audio_data.get_flac_data( - convert_rate = None if audio_data.sample_rate >= 8000 else 8000, # audio samples must be at least 8 kHz + result_format = 'detailed' + access_token, expire_time = getattr(self, "azure_cached_access_token", None), getattr(self, "azure_cached_access_token_expiry", None) + allow_caching = True + try: + from time import ( + monotonic, # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ + ) + except ImportError: + expire_time = None # monotonic time not available, don't cache access tokens + allow_caching = False # don't allow caching, since monotonic time isn't available + if expire_time is None or monotonic() > expire_time: # caching not enabled, first credential request, or the access token from the previous one expired + # get an access token using OAuth + credential_url = "https://" + location + ".api.cognitive.microsoft.com/sts/v1.0/issueToken" + credential_request = Request(credential_url, data=b"", headers={ + "Content-type": "application/x-www-form-urlencoded", + "Content-Length": "0", + "Ocp-Apim-Subscription-Key": key, + }) + + if allow_caching: + start_time = monotonic() + + try: + credential_response = urlopen(credential_request, timeout=60) # credential response can take longer, use longer timeout instead of default one + except HTTPError as e: + raise RequestError("credential request failed: {}".format(e.reason)) + except URLError as e: + raise RequestError("credential connection failed: {}".format(e.reason)) + access_token = credential_response.read().decode("utf-8") + + if allow_caching: + # save the token for the duration it is valid for + self.azure_cached_access_token = access_token + self.azure_cached_access_token_expiry = start_time + 600 # according to https://docs.microsoft.com/en-us/azure/cognitive-services/Speech-Service/rest-apis#authentication, the token expires in exactly 10 minutes + + wav_data = audio_data.get_wav_data( + convert_rate=16000, # audio samples must be 8kHz or 16 kHz + convert_width=2 # audio samples should be 16-bit ) - if key is None: key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw" - url = "http://www.google.com/speech-api/v2/recognize?client=chromium&lang={0}&key={1}".format(language, key) - request = Request(url, data = flac_data, headers = {"Content-Type": "audio/x-flac; rate={0}".format(audio_data.sample_rate)}) - # obtain audio transcription results + url = "https://" + location + ".stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?{}".format(urlencode({ + "language": language, + "format": result_format, + "profanity": profanity + })) + + if sys.version_info >= (3, 6): # chunked-transfer requests are only supported in the standard library as of Python 3.6+, use it if possible + request = Request(url, data=io.BytesIO(wav_data), headers={ + "Authorization": "Bearer {}".format(access_token), + "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000", + "Transfer-Encoding": "chunked", + }) + else: # 
fall back on manually formatting the POST body as a chunked request + ascii_hex_data_length = "{:X}".format(len(wav_data)).encode("utf-8") + chunked_transfer_encoding_data = ascii_hex_data_length + b"\r\n" + wav_data + b"\r\n0\r\n\r\n" + request = Request(url, data=chunked_transfer_encoding_data, headers={ + "Authorization": "Bearer {}".format(access_token), + "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000", + "Transfer-Encoding": "chunked", + }) + try: - response = urlopen(request) + response = urlopen(request, timeout=self.operation_timeout) except HTTPError as e: - raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6 + raise RequestError("recognition request failed: {}".format(e.reason)) except URLError as e: - raise RequestError("recognition connection failed: {0}".format(e.reason)) + raise RequestError("recognition connection failed: {}".format(e.reason)) response_text = response.read().decode("utf-8") - - # ignore any blank blocks - actual_result = [] - for line in response_text.split("\n"): - if not line: continue - result = json.loads(line)["result"] - if len(result) != 0: - actual_result = result[0] - break + result = json.loads(response_text) # return results - if show_all: return actual_result - if "alternative" not in actual_result: raise UnknownValueError() - for entry in actual_result["alternative"]: - if "transcript" in entry: - return entry["transcript"] - raise UnknownValueError() # no transcriptions available + if show_all: + return result + if "RecognitionStatus" not in result or result["RecognitionStatus"] != "Success" or "NBest" not in result: + raise UnknownValueError() + return result['NBest'][0]["Display"], result['NBest'][0]["Confidence"] - def recognize_wit(self, audio_data, key, show_all = False): + def recognize_bing(self, audio_data, key, language="en-US", show_all=False): """ - Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Wit.ai API. - - The Wit.ai API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account `__ and creating an app. You will need to add at least one intent (recognizable sentence) before the API key can be accessed, though the actual intent values don't matter. + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Speech API. - To get the API key for a Wit.ai app, go to the app settings, go to the section titled "API Details", and look for "Server Access Token" or "Client Access Token". If the desired field is blank, click on the "Reset token" button on the right of the field. Wit.ai API keys are 32-character uppercase alphanumeric strings. + The Microsoft Bing Speech API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account `__ with Microsoft Azure. - Though Wit.ai is designed to be used with a fixed set of phrases, it still provides services for general-purpose speech recognition. + To get the API key, go to the `Microsoft Azure Portal Resources `__ page, go to "All Resources" > "Add" > "See All" > Search "Bing Speech API > "Create", and fill in the form to make a "Bing Speech API" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the `key` parameter. 
Microsoft Bing Speech API keys are 32-character lowercase hexadecimal strings. - The recognition language is configured in the Wit.ai app settings. + The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation `__ under "Interactive and dictation mode". - Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. + Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. - Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the key isn't valid, the quota for the key is maxed out, or there is no internet connection. + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. """ assert isinstance(audio_data, AudioData), "Data must be audio data" - assert isinstance(key, str), "`key` must be a string" + assert isinstance(key, str), "``key`` must be a string" + assert isinstance(language, str), "``language`` must be a string" + + access_token, expire_time = getattr(self, "bing_cached_access_token", None), getattr(self, "bing_cached_access_token_expiry", None) + allow_caching = True + try: + from time import ( + monotonic, # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+ + ) + except ImportError: + expire_time = None # monotonic time not available, don't cache access tokens + allow_caching = False # don't allow caching, since monotonic time isn't available + if expire_time is None or monotonic() > expire_time: # caching not enabled, first credential request, or the access token from the previous one expired + # get an access token using OAuth + credential_url = "https://api.cognitive.microsoft.com/sts/v1.0/issueToken" + credential_request = Request(credential_url, data=b"", headers={ + "Content-type": "application/x-www-form-urlencoded", + "Content-Length": "0", + "Ocp-Apim-Subscription-Key": key, + }) + + if allow_caching: + start_time = monotonic() + + try: + credential_response = urlopen(credential_request, timeout=60) # credential response can take longer, use longer timeout instead of default one + except HTTPError as e: + raise RequestError("credential request failed: {}".format(e.reason)) + except URLError as e: + raise RequestError("credential connection failed: {}".format(e.reason)) + access_token = credential_response.read().decode("utf-8") + + if allow_caching: + # save the token for the duration it is valid for + self.bing_cached_access_token = access_token + self.bing_cached_access_token_expiry = start_time + 600 # according to https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition, the token expires in exactly 10 minutes wav_data = audio_data.get_wav_data( - convert_rate = None if audio_data.sample_rate >= 8000 else 8000, # audio samples must be at least 8 kHz - convert_width = None if audio_data.sample_width in [2, 4] else 4 # audio samples should be either 16-bit or 32-bit + convert_rate=16000, # audio samples must be 
8kHz or 16 kHz + convert_width=2 # audio samples should be 16-bit ) - url = "https://api.wit.ai/speech?v=20141022" - request = Request(url, data = wav_data, headers = {"Authorization": "Bearer {0}".format(key), "Content-Type": "audio/wav"}) + + url = "https://speech.platform.bing.com/speech/recognition/interactive/cognitiveservices/v1?{}".format(urlencode({ + "language": language, + "locale": language, + "requestid": uuid.uuid4(), + })) + + if sys.version_info >= (3, 6): # chunked-transfer requests are only supported in the standard library as of Python 3.6+, use it if possible + request = Request(url, data=io.BytesIO(wav_data), headers={ + "Authorization": "Bearer {}".format(access_token), + "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000", + "Transfer-Encoding": "chunked", + }) + else: # fall back on manually formatting the POST body as a chunked request + ascii_hex_data_length = "{:X}".format(len(wav_data)).encode("utf-8") + chunked_transfer_encoding_data = ascii_hex_data_length + b"\r\n" + wav_data + b"\r\n0\r\n\r\n" + request = Request(url, data=chunked_transfer_encoding_data, headers={ + "Authorization": "Bearer {}".format(access_token), + "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000", + "Transfer-Encoding": "chunked", + }) + try: - response = urlopen(request) + response = urlopen(request, timeout=self.operation_timeout) except HTTPError as e: - raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6 + raise RequestError("recognition request failed: {}".format(e.reason)) except URLError as e: - raise RequestError("recognition connection failed: {0}".format(e.reason)) + raise RequestError("recognition connection failed: {}".format(e.reason)) response_text = response.read().decode("utf-8") result = json.loads(response_text) # return results if show_all: return result - if "_text" not in result or result["_text"] is None: raise UnknownValueError() - return result["_text"] + if "RecognitionStatus" not in result or result["RecognitionStatus"] != "Success" or "DisplayText" not in result: raise UnknownValueError() + return result["DisplayText"] - def recognize_ibm(self, audio_data, username, password, language = "en-US", show_all = False): + def recognize_lex(self, audio_data, bot_name, bot_alias, user_id, content_type="audio/l16; rate=16000; channels=1", access_key_id=None, secret_access_key=None, region=None): """ - Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the IBM Speech to Text API. + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Amazon Lex API. 
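A minimal caller-side sketch of this method (illustrative only: the bot name, alias and user id are placeholders, and AWS credentials are assumed to be configured as described in the note that follows):

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.Microphone() as source:          # requires PyAudio
        audio = r.listen(source)
    # recognize_lex posts 16 kHz, 16-bit mono PCM to the Lex runtime and returns
    # the "inputTranscript" field of the response
    text = r.recognize_lex(audio, bot_name="OrderFlowers", bot_alias="prod", user_id="user-1234")
    print(text)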
+ + If access_key_id or secret_access_key is not set it will go through the list in the link below + http://boto3.readthedocs.io/en/latest/guide/configuration.html#configuring-credentials + """ + assert isinstance(audio_data, AudioData), "Data must be audio data" + assert isinstance(bot_name, str), "``bot_name`` must be a string" + assert isinstance(bot_alias, str), "``bot_alias`` must be a string" + assert isinstance(user_id, str), "``user_id`` must be a string" + assert isinstance(content_type, str), "``content_type`` must be a string" + assert access_key_id is None or isinstance(access_key_id, str), "``access_key_id`` must be a string" + assert secret_access_key is None or isinstance(secret_access_key, str), "``secret_access_key`` must be a string" + assert region is None or isinstance(region, str), "``region`` must be a string" + + try: + import boto3 + except ImportError: + raise RequestError("missing boto3 module: ensure that boto3 is set up correctly.") + + client = boto3.client('lex-runtime', aws_access_key_id=access_key_id, + aws_secret_access_key=secret_access_key, + region_name=region) - The IBM Speech to Text username and password are specified by ``username`` and ``password``, respectively. Unfortunately, these are not available without an account. IBM has published instructions for obtaining these credentials in the `IBM Watson Developer Cloud documentation `__. + raw_data = audio_data.get_raw_data( + convert_rate=16000, convert_width=2 + ) + + accept = "text/plain; charset=utf-8" + response = client.post_content(botName=bot_name, botAlias=bot_alias, userId=user_id, contentType=content_type, accept=accept, inputStream=raw_data) + + return response["inputTranscript"] + + def recognize_houndify(self, audio_data, client_id, client_key, show_all=False): + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Houndify API. + + The Houndify client ID and client key are specified by ``client_id`` and ``client_key``, respectively. Unfortunately, these are not available without `signing up for an account `__. Once logged into the `dashboard `__, you will want to select "Register a new client", and fill in the form as necessary. When at the "Enable Domains" page, enable the "Speech To Text Only" domain, and then select "Save & Continue". - The recognition language is determined by ``language``, an IETF language tag with a dialect like ``"en-US"`` or ``"es-ES"``, defaulting to US English. At the moment, this supports the tags ``"en-US"`` and ``"es-ES"``. + To get the client ID and client key for a Houndify client, go to the `dashboard `__ and select the client's "View Details" link. On the resulting page, the client ID and client key will be visible. Client IDs and client keys are both Base64-encoded strings. - Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. + Currently, only English is supported as a recognition language. + + Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary. - Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if an error occurred, such as an invalid key, or a broken internet connection. + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. 
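An illustrative call (the client ID and key below are placeholders; note that on success this method returns a (transcript, confidence) pair rather than a bare string):

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("speech.wav") as source:   # placeholder path
        audio = r.record(source)
    transcript, confidence = r.recognize_houndify(
        audio, client_id="BASE64_CLIENT_ID", client_key="BASE64_CLIENT_KEY"
    )
    print(transcript, confidence)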
Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. """ assert isinstance(audio_data, AudioData), "Data must be audio data" - assert isinstance(username, str), "`username` must be a string" - assert isinstance(password, str), "`password` must be a string" - assert language in ["en-US", "es-ES"], "`language` must be a valid language." + assert isinstance(client_id, str), "``client_id`` must be a string" + assert isinstance(client_key, str), "``client_key`` must be a string" - flac_data = audio_data.get_flac_data( - convert_rate = None if audio_data.sample_rate >= 16000 else 16000 # audio samples should be at least 16 kHz + wav_data = audio_data.get_wav_data( + convert_rate=None if audio_data.sample_rate in [8000, 16000] else 16000, # audio samples must be 8 kHz or 16 kHz + convert_width=2 # audio samples should be 16-bit ) - model = "{0}_BroadbandModel".format(language) - url = "https://stream.watsonplatform.net/speech-to-text/api/v1/recognize?continuous=true&model={0}".format(model) - request = Request(url, data = flac_data, headers = {"Content-Type": "audio/x-flac"}) - if hasattr("", "encode"): - authorization_value = base64.standard_b64encode("{0}:{1}".format(username, password).encode("utf-8")).decode("utf-8") - else: - authorization_value = base64.standard_b64encode("{0}:{1}".format(username, password)) - request.add_header("Authorization", "Basic {0}".format(authorization_value)) + url = "https://api.houndify.com/v1/audio" + user_id, request_id = str(uuid.uuid4()), str(uuid.uuid4()) + request_time = str(int(time.time())) + request_signature = base64.urlsafe_b64encode( + hmac.new( + base64.urlsafe_b64decode(client_key), + user_id.encode("utf-8") + b";" + request_id.encode("utf-8") + request_time.encode("utf-8"), + hashlib.sha256 + ).digest() # get the HMAC digest as bytes + ).decode("utf-8") + request = Request(url, data=wav_data, headers={ + "Content-Type": "application/json", + "Hound-Request-Info": json.dumps({"ClientID": client_id, "UserID": user_id}), + "Hound-Request-Authentication": "{};{}".format(user_id, request_id), + "Hound-Client-Authentication": "{};{};{}".format(client_id, request_time, request_signature) + }) try: - response = urlopen(request) + response = urlopen(request, timeout=self.operation_timeout) except HTTPError as e: - raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6 + raise RequestError("recognition request failed: {}".format(e.reason)) except URLError as e: - raise RequestError("recognition connection failed: {0}".format(e.reason)) + raise RequestError("recognition connection failed: {}".format(e.reason)) response_text = response.read().decode("utf-8") result = json.loads(response_text) # return results if show_all: return result - if "results" not in result or len(result["results"]) < 1 or "alternatives" not in result["results"][0]: + if "Disambiguation" not in result or result["Disambiguation"] is None: raise UnknownValueError() - for entry in result["results"][0]["alternatives"]: - if "transcript" in entry: return entry["transcript"] - raise UnknownValueError() # no transcriptions available + return result['Disambiguation']['ChoiceData'][0]['Transcription'], result['Disambiguation']['ChoiceData'][0]['ConfidenceScore'] + + def recognize_amazon(self, audio_data, bucket_name=None, access_key_id=None, secret_access_key=None, region=None, 
job_name=None, file_key=None): + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance) using Amazon Transcribe. + https://aws.amazon.com/transcribe/ + If access_key_id or secret_access_key is not set it will go through the list in the link below + http://boto3.readthedocs.io/en/latest/guide/configuration.html#configuring-credentials + """ + assert access_key_id is None or isinstance(access_key_id, str), "``access_key_id`` must be a string" + assert secret_access_key is None or isinstance(secret_access_key, str), "``secret_access_key`` must be a string" + assert region is None or isinstance(region, str), "``region`` must be a string" + import multiprocessing + import traceback + import uuid + + from botocore.exceptions import ClientError + proc = multiprocessing.current_process() + + check_existing = audio_data is None and job_name + + bucket_name = bucket_name or ('%s-%s' % (str(uuid.uuid4()), proc.pid)) + job_name = job_name or ('%s-%s' % (str(uuid.uuid4()), proc.pid)) + + try: + import boto3 + except ImportError: + raise RequestError("missing boto3 module: ensure that boto3 is set up correctly.") + + transcribe = boto3.client( + 'transcribe', + aws_access_key_id=access_key_id, + aws_secret_access_key=secret_access_key, + region_name=region) + + s3 = boto3.client( + 's3', + aws_access_key_id=access_key_id, + aws_secret_access_key=secret_access_key, + region_name=region) + + session = boto3.Session( + aws_access_key_id=access_key_id, + aws_secret_access_key=secret_access_key, + region_name=region + ) + + # Upload audio data to S3. + filename = '%s.wav' % job_name + try: + # Bucket creation fails surprisingly often, even if the bucket exists. + # print('Attempting to create bucket %s...' % bucket_name) + s3.create_bucket(Bucket=bucket_name) + except ClientError as exc: + print('Error creating bucket %s: %s' % (bucket_name, exc)) + s3res = session.resource('s3') + if audio_data is not None: + print('Uploading audio data...') + wav_data = audio_data.get_wav_data() + s3.put_object(Bucket=bucket_name, Key=filename, Body=wav_data) + object_acl = s3res.ObjectAcl(bucket_name, filename) + object_acl.put(ACL='public-read') + else: + print('Skipping audio upload.') + job_uri = 'https://%s.s3.amazonaws.com/%s' % (bucket_name, filename) + + if check_existing: + + # Wait for job to complete. + try: + status = transcribe.get_transcription_job(TranscriptionJobName=job_name) + except ClientError as exc: + print('!' * 80) + print('Error getting job:', exc.response) + if exc.response['Error']['Code'] == 'BadRequestException' and "The requested job couldn't be found" in str(exc): + # Some error caused the job we recorded to not exist on AWS. + # Likely we were interrupted right after retrieving and deleting the job but before recording the transcript. + # Reset and try again later. + exc = TranscriptionNotReady() + exc.job_name = None + exc.file_key = None + raise exc + else: + # Some other error happened, so re-raise. + raise + + job = status['TranscriptionJob'] + if job['TranscriptionJobStatus'] in ['COMPLETED'] and 'TranscriptFileUri' in job['Transcript']: + + # Retrieve transcription JSON containing transcript. 
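                # The file fetched below is the standard Amazon Transcribe result JSON: the text is taken
                # from results.transcripts[0].transcript, and the per-item confidences in
                # results.items[*].alternatives[0].confidence are averaged into a single score
                # (falling back to 0.5 when no confidences are present).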
+ transcript_uri = job['Transcript']['TranscriptFileUri'] + import json + import urllib.request + with urllib.request.urlopen(transcript_uri) as json_data: + d = json.load(json_data) + confidences = [] + for item in d['results']['items']: + confidences.append(float(item['alternatives'][0]['confidence'])) + confidence = 0.5 + if confidences: + confidence = sum(confidences) / float(len(confidences)) + transcript = d['results']['transcripts'][0]['transcript'] + + # Delete job. + try: + transcribe.delete_transcription_job(TranscriptionJobName=job_name) # cleanup + except Exception as exc: + print('Warning, could not clean up transcription: %s' % exc) + traceback.print_exc() + + # Delete S3 file. + s3.delete_object(Bucket=bucket_name, Key=filename) + + return transcript, confidence + elif job['TranscriptionJobStatus'] in ['FAILED']: + + # Delete job. + try: + transcribe.delete_transcription_job(TranscriptionJobName=job_name) # cleanup + except Exception as exc: + print('Warning, could not clean up transcription: %s' % exc) + traceback.print_exc() + + # Delete S3 file. + s3.delete_object(Bucket=bucket_name, Key=filename) + + exc = TranscriptionFailed() + exc.job_name = None + exc.file_key = None + raise exc + else: + # Keep waiting. + print('Keep waiting.') + exc = TranscriptionNotReady() + exc.job_name = job_name + exc.file_key = None + raise exc + + else: + + # Launch the transcription job. + try: + transcribe.start_transcription_job( + TranscriptionJobName=job_name, + Media={'MediaFileUri': job_uri}, + MediaFormat='wav', + LanguageCode='en-US' + ) + exc = TranscriptionNotReady() + exc.job_name = job_name + exc.file_key = None + raise exc + except ClientError as exc: + print('!' * 80) + print('Error starting job:', exc.response) + if exc.response['Error']['Code'] == 'LimitExceededException': + # Could not start job. Cancel everything. + s3.delete_object(Bucket=bucket_name, Key=filename) + exc = TranscriptionNotReady() + exc.job_name = None + exc.file_key = None + raise exc + else: + # Some other error happened, so re-raise. + raise - def recognize_att(self, audio_data, app_key, app_secret, language = "en-US", show_all = False): + def recognize_assemblyai(self, audio_data, api_token, job_name=None, **kwargs): + """ + Wraps the AssemblyAI STT service. + https://www.assemblyai.com/ """ - Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the AT&T Speech to Text API. - The AT&T Speech to Text app key and app secret are specified by ``app_key`` and ``app_secret``, respectively. Unfortunately, these are not available without `signing up for an account `__ and creating an app. + def read_file(filename, chunk_size=5242880): + with open(filename, 'rb') as _file: + while True: + data = _file.read(chunk_size) + if not data: + break + yield data + + import requests + + check_existing = audio_data is None and job_name + if check_existing: + # Query status. + transciption_id = job_name + endpoint = f"https://api.assemblyai.com/v2/transcript/{transciption_id}" + headers = { + "authorization": api_token, + } + response = requests.get(endpoint, headers=headers) + data = response.json() + status = data['status'] + + if status == 'error': + # Handle error. + exc = TranscriptionFailed() + exc.job_name = None + exc.file_key = None + raise exc + # Handle success. + elif status == 'completed': + confidence = data['confidence'] + text = data['text'] + return text, confidence + + # Otherwise keep waiting. 
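            # Callers are expected to catch TranscriptionNotReady, keep the job_name it carries,
            # and call recognize_assemblyai again later with audio_data=None and that job_name to
            # poll for the finished transcript.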
+ print('Keep waiting.') + exc = TranscriptionNotReady() + exc.job_name = job_name + exc.file_key = None + raise exc + else: + # Upload file. + headers = {'authorization': api_token} + response = requests.post('https://api.assemblyai.com/v2/upload', + headers=headers, + data=read_file(audio_data)) + upload_url = response.json()['upload_url'] + + # Queue file for transcription. + endpoint = "https://api.assemblyai.com/v2/transcript" + json = {"audio_url": upload_url} + headers = { + "authorization": api_token, + "content-type": "application/json" + } + response = requests.post(endpoint, json=json, headers=headers) + data = response.json() + transciption_id = data['id'] + exc = TranscriptionNotReady() + exc.job_name = transciption_id + exc.file_key = None + raise exc + + def recognize_ibm(self, audio_data, key, language="en-US", show_all=False): + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the IBM Speech to Text API. - To get the app key and app secret for an AT&T app, go to the `My Apps page `__ and look for "APP KEY" and "APP SECRET". AT&T app keys and app secrets are 32-character lowercase alphanumeric strings. + The IBM Speech to Text username and password are specified by ``username`` and ``password``, respectively. Unfortunately, these are not available without `signing up for an account `__. Once logged into the Bluemix console, follow the instructions for `creating an IBM Watson service instance `__, where the Watson service is "Speech To Text". IBM Speech to Text usernames are strings of the form XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX, while passwords are mixed-case alphanumeric strings. - The recognition language is determined by ``language``, an IETF language tag with a dialect like ``"en-US"`` or ``"es-ES"``, defaulting to US English. At the moment, this supports the tags ``"en-US"`` and ``"es-ES"``. + The recognition language is determined by ``language``, an RFC5646 language tag with a dialect like ``"en-US"`` (US English) or ``"zh-CN"`` (Mandarin Chinese), defaulting to US English. The supported language values are listed under the ``model`` parameter of the `audio recognition API documentation `__, in the form ``LANGUAGE_BroadbandModel``, where ``LANGUAGE`` is the language value. - Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. + Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. - Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the key isn't valid, or there is no internet connection. + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. """ assert isinstance(audio_data, AudioData), "Data must be audio data" - assert isinstance(app_key, str), "`app_key` must be a string" - assert isinstance(app_secret, str), "`app_secret` must be a string" - assert language in ["en-US", "es-US"], "`language` must be a valid language." 
- - # ensure we have an authentication token - authorization_url = "https://api.att.com/oauth/v4/token" - authorization_body = "client_id={0}&client_secret={1}&grant_type=client_credentials&scope=SPEECH".format(app_key, app_secret) - try: - authorization_response = urlopen(authorization_url, data = authorization_body.encode("utf-8")) - except HTTPError as e: - raise RequestError("credential request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6 - except URLError as e: - raise RequestError("credential connection failed: {0}".format(e.reason)) - authorization_text = authorization_response.read().decode("utf-8") - authorization_bearer = json.loads(authorization_text).get("access_token") - if authorization_bearer is None: raise RequestError("missing OAuth access token in requested credentials") + assert isinstance(key, str), "``key`` must be a string" - wav_data = audio_data.get_wav_data( - convert_rate = 8000 if audio_data.sample_rate < 16000 else 16000, # audio samples should be either 8 kHz or 16 kHz - convert_width = 2 # audio samples should be 16-bit + flac_data = audio_data.get_flac_data( + convert_rate=None if audio_data.sample_rate >= 16000 else 16000, # audio samples should be at least 16 kHz + convert_width=None if audio_data.sample_width >= 2 else 2 # audio samples should be at least 16-bit ) - url = "https://api.att.com/speech/v3/speechToText" - request = Request(url, data = wav_data, headers = {"Authorization": "Bearer {0}".format(authorization_bearer), "Content-Language": language, "Content-Type": "audio/wav"}) + url = "https://gateway-wdc.watsonplatform.net/speech-to-text/api/v1/recognize" + request = Request(url, data=flac_data, headers={ + "Content-Type": "audio/x-flac", + }) + request.get_method = lambda: 'POST' + username = 'apikey' + password = key + authorization_value = base64.standard_b64encode("{}:{}".format(username, password).encode("utf-8")).decode("utf-8") + request.add_header("Authorization", "Basic {}".format(authorization_value)) try: - response = urlopen(request) + response = urlopen(request, timeout=self.operation_timeout) except HTTPError as e: - raise RequestError("recognition request failed: {0}".format(getattr(e, "reason", "status {0}".format(e.code)))) # use getattr to be compatible with Python 2.6 + raise RequestError("recognition request failed: {}".format(e.reason)) except URLError as e: - raise RequestError("recognition connection failed: {0}".format(e.reason)) + raise RequestError("recognition connection failed: {}".format(e.reason)) response_text = response.read().decode("utf-8") result = json.loads(response_text) # return results - if show_all: return result - if "Recognition" not in result or "NBest" not in result["Recognition"]: + if show_all: + return result + if "results" not in result or len(result["results"]) < 1 or "alternatives" not in result["results"][0]: raise UnknownValueError() - for entry in result["Recognition"]["NBest"]: - if entry.get("Grade") == "accept" and "ResultText" in entry: - return entry["ResultText"] - raise UnknownValueError() # no transcriptions available -def shutil_which(pgm): - """ - python2 backport of python3's shutil.which() - """ - path = os.getenv('PATH') - for p in path.split(os.path.pathsep): - p = os.path.join(p, pgm) - if os.path.exists(p) and os.access(p, os.X_OK): - return p + transcription = [] + confidence = None + for utterance in result["results"]: + if "alternatives" not in utterance: raise UnknownValueError() + for hypothesis in 
utterance["alternatives"]: + if "transcript" in hypothesis: + transcription.append(hypothesis["transcript"]) + confidence = hypothesis["confidence"] + break + return "\n".join(transcription), confidence + + lasttfgraph = '' + tflabels = None + + def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_actions_frozen.pb', tensor_label='tensorflow-data/conv_actions_labels.txt'): + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance). + + Path to Tensor loaded from ``tensor_graph``. You can download a model here: http://download.tensorflow.org/models/speech_commands_v0.01.zip + + Path to Tensor Labels file loaded from ``tensor_label``. + """ + assert isinstance(audio_data, AudioData), "Data must be audio data" + assert isinstance(tensor_graph, str), "``tensor_graph`` must be a string" + assert isinstance(tensor_label, str), "``tensor_label`` must be a string" + + try: + import tensorflow as tf + except ImportError: + raise RequestError("missing tensorflow module: ensure that tensorflow is set up correctly.") + + if not (tensor_graph == self.lasttfgraph): + self.lasttfgraph = tensor_graph + + # load graph + with tf.gfile.FastGFile(tensor_graph, 'rb') as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + tf.import_graph_def(graph_def, name='') + # load labels + self.tflabels = [line.rstrip() for line in tf.gfile.GFile(tensor_label)] + + wav_data = audio_data.get_wav_data( + convert_rate=16000, convert_width=2 + ) + + with tf.Session() as sess: + input_layer_name = 'wav_data:0' + output_layer_name = 'labels_softmax:0' + softmax_tensor = sess.graph.get_tensor_by_name(output_layer_name) + predictions, = sess.run(softmax_tensor, {input_layer_name: wav_data}) + + # Sort labels in order of confidence + top_k = predictions.argsort()[-1:][::-1] + for node_id in top_k: + human_string = self.tflabels[node_id] + return human_string + + def recognize_vosk(self, audio_data, language='en'): + from vosk import KaldiRecognizer, Model + + assert isinstance(audio_data, AudioData), "Data must be audio data" + + if not hasattr(self, 'vosk_model'): + if not os.path.exists("model"): + return "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder." + exit(1) + self.vosk_model = Model("model") + + rec = KaldiRecognizer(self.vosk_model, 16000) + + rec.AcceptWaveform(audio_data.get_raw_data(convert_rate=16000, convert_width=2)) + finalRecognition = rec.FinalResult() + + return finalRecognition + + +class PortableNamedTemporaryFile(object): + """Limited replacement for ``tempfile.NamedTemporaryFile``, except unlike ``tempfile.NamedTemporaryFile``, the file can be opened again while it's currently open, even on Windows.""" + def __init__(self, mode="w+b"): + self.mode = mode + + def __enter__(self): + # create the temporary file and open it + file_descriptor, file_path = tempfile.mkstemp() + self._file = os.fdopen(file_descriptor, self.mode) + + # the name property is a public field + self.name = file_path + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._file.close() + os.remove(self.name) + + def write(self, *args, **kwargs): + return self._file.write(*args, **kwargs) + + def writelines(self, *args, **kwargs): + return self._file.writelines(*args, **kwargs) + + def flush(self, *args, **kwargs): + return self._file.flush(*args, **kwargs) + + +# During the pip install process, the 'import speech_recognition' command in setup.py is executed. 
+# At this time, the dependencies are not yet installed, resulting in a ModuleNotFoundError. +# This is a workaround to resolve this issue +try: + from .recognizers import google, google_cloud, pocketsphinx + from .recognizers.whisper_api import groq, openai + from .recognizers.whisper_local import faster_whisper, whisper +except (ModuleNotFoundError, ImportError): + pass +else: + Recognizer.recognize_google = google.recognize_legacy + Recognizer.recognize_google_cloud = google_cloud.recognize + Recognizer.recognize_whisper = whisper.recognize + Recognizer.recognize_faster_whisper = faster_whisper.recognize + Recognizer.recognize_openai = openai.recognize + Recognizer.recognize_groq = groq.recognize + Recognizer.recognize_sphinx = pocketsphinx.recognize + + +# =============================== +# backwards compatibility shims +# =============================== + +WavFile = AudioFile # WavFile was renamed to AudioFile in 3.4.1 + + +def recognize_api(self, audio_data, client_access_token, language="en", session_id=None, show_all=False): + wav_data = audio_data.get_wav_data(convert_rate=16000, convert_width=2) + url = "https://api.api.ai/v1/query" + while True: + boundary = uuid.uuid4().hex + if boundary.encode("utf-8") not in wav_data: break + if session_id is None: session_id = uuid.uuid4().hex + data = b"--" + boundary.encode("utf-8") + b"\r\n" + b"Content-Disposition: form-data; name=\"request\"\r\n" + b"Content-Type: application/json\r\n" + b"\r\n" + b"{\"v\": \"20150910\", \"sessionId\": \"" + session_id.encode("utf-8") + b"\", \"lang\": \"" + language.encode("utf-8") + b"\"}\r\n" + b"--" + boundary.encode("utf-8") + b"\r\n" + b"Content-Disposition: form-data; name=\"voiceData\"; filename=\"audio.wav\"\r\n" + b"Content-Type: audio/wav\r\n" + b"\r\n" + wav_data + b"\r\n" + b"--" + boundary.encode("utf-8") + b"--\r\n" + request = Request(url, data=data, headers={"Authorization": "Bearer {}".format(client_access_token), "Content-Length": str(len(data)), "Expect": "100-continue", "Content-Type": "multipart/form-data; boundary={}".format(boundary)}) + try: response = urlopen(request, timeout=10) + except HTTPError as e: raise RequestError("recognition request failed: {}".format(e.reason)) + except URLError as e: raise RequestError("recognition connection failed: {}".format(e.reason)) + response_text = response.read().decode("utf-8") + result = json.loads(response_text) + if show_all: return result + if "status" not in result or "errorType" not in result["status"] or result["status"]["errorType"] != "success": + raise UnknownValueError() + return result["result"]["resolvedQuery"] + + +Recognizer.recognize_api = classmethod(recognize_api) # API.AI Speech Recognition is deprecated/not recommended as of 3.5.0, and currently is only optionally available for paid plans diff --git a/speech_recognition/__main__.py b/speech_recognition/__main__.py index 649f42c3..faf89799 100644 --- a/speech_recognition/__main__.py +++ b/speech_recognition/__main__.py @@ -15,11 +15,7 @@ # recognize speech using Google Speech Recognition value = r.recognize_google(audio) - # we need some special handling here to correctly print unicode characters to standard output - if str is bytes: # this version of Python uses bytes for strings (Python 2) - print(u"You said {}".format(value).encode("utf-8")) - else: # this version of Python uses unicode for strings (Python 3+) - print("You said {}".format(value)) + print("You said {}".format(value)) except sr.UnknownValueError: print("Oops! 
Didn't catch that") except sr.RequestError as e: diff --git a/speech_recognition/audio.py b/speech_recognition/audio.py new file mode 100644 index 00000000..2322f36e --- /dev/null +++ b/speech_recognition/audio.py @@ -0,0 +1,318 @@ +import aifc +import audioop +import io +import os +import platform +import stat +import subprocess +import sys +import wave + + +class AudioData(object): + """ + Creates a new ``AudioData`` instance, which represents mono audio data. + + The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format. + + The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample. + + The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz). + + Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly. + """ + + def __init__(self, frame_data, sample_rate, sample_width): + assert sample_rate > 0, "Sample rate must be a positive integer" + assert ( + sample_width % 1 == 0 and 1 <= sample_width <= 4 + ), "Sample width must be between 1 and 4 inclusive" + self.frame_data = frame_data + self.sample_rate = sample_rate + self.sample_width = int(sample_width) + + def get_segment(self, start_ms=None, end_ms=None): + """ + Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in. + + If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end. + """ + assert ( + start_ms is None or start_ms >= 0 + ), "``start_ms`` must be a non-negative number" + assert end_ms is None or end_ms >= ( + 0 if start_ms is None else start_ms + ), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``" + if start_ms is None: + start_byte = 0 + else: + start_byte = int( + (start_ms * self.sample_rate * self.sample_width) // 1000 + ) + if end_ms is None: + end_byte = len(self.frame_data) + else: + end_byte = int( + (end_ms * self.sample_rate * self.sample_width) // 1000 + ) + return AudioData( + self.frame_data[start_byte:end_byte], + self.sample_rate, + self.sample_width, + ) + + def get_raw_data(self, convert_rate=None, convert_width=None): + """ + Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance. + + If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. + + If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. + + Writing these bytes directly to a file results in a valid `RAW/PCM audio file `__. 
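A small self-contained sketch of the conversions described above (sample values chosen arbitrarily):

    import speech_recognition as sr

    # two 16-bit samples at 8 kHz
    audio = sr.AudioData(b"\x00\x00\xff\x7f", sample_rate=8000, sample_width=2)

    assert audio.get_raw_data() == b"\x00\x00\xff\x7f"             # no conversion requested
    pcm = audio.get_raw_data(convert_rate=16000, convert_width=1)  # resampled to 16 kHz, 8-bit unsigned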
+ """ + assert ( + convert_rate is None or convert_rate > 0 + ), "Sample rate to convert to must be a positive integer" + assert convert_width is None or ( + convert_width % 1 == 0 and 1 <= convert_width <= 4 + ), "Sample width to convert to must be between 1 and 4 inclusive" + + raw_data = self.frame_data + + # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples) + if self.sample_width == 1: + raw_data = audioop.bias( + raw_data, 1, -128 + ) # subtract 128 from every sample to make them act like signed samples + + # resample audio at the desired rate if specified + if convert_rate is not None and self.sample_rate != convert_rate: + raw_data, _ = audioop.ratecv( + raw_data, + self.sample_width, + 1, + self.sample_rate, + convert_rate, + None, + ) + + # convert samples to desired sample width if specified + if convert_width is not None and self.sample_width != convert_width: + if ( + convert_width == 3 + ): # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866) + raw_data = audioop.lin2lin( + raw_data, self.sample_width, 4 + ) # convert audio into 32-bit first, which is always supported + try: + audioop.bias( + b"", 3, 0 + ) # test whether 24-bit audio is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do) + except ( + audioop.error + ): # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less) + raw_data = b"".join( + raw_data[i + 1: i + 4] + for i in range(0, len(raw_data), 4) + ) # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample + else: # 24-bit audio fully supported, we don't need to shim anything + raw_data = audioop.lin2lin( + raw_data, self.sample_width, convert_width + ) + else: + raw_data = audioop.lin2lin( + raw_data, self.sample_width, convert_width + ) + + # if the output is 8-bit audio with unsigned samples, convert the samples we've been treating as signed to unsigned again + if convert_width == 1: + raw_data = audioop.bias( + raw_data, 1, 128 + ) # add 128 to every sample to make them act like unsigned samples again + + return raw_data + + def get_wav_data(self, convert_rate=None, convert_width=None): + """ + Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance. + + If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. + + If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. + + Writing these bytes directly to a file results in a valid `WAV file `__. 
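For example, captured audio can be persisted as a 16 kHz, 16-bit mono WAV file like this (the output path is a placeholder):

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.Microphone() as source:            # requires PyAudio
        audio = r.listen(source)
    with open("out.wav", "wb") as f:
        f.write(audio.get_wav_data(convert_rate=16000, convert_width=2))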
+ """ + raw_data = self.get_raw_data(convert_rate, convert_width) + sample_rate = ( + self.sample_rate if convert_rate is None else convert_rate + ) + sample_width = ( + self.sample_width if convert_width is None else convert_width + ) + + # generate the WAV file contents + with io.BytesIO() as wav_file: + wav_writer = wave.open(wav_file, "wb") + try: # note that we can't use context manager, since that was only added in Python 3.4 + wav_writer.setframerate(sample_rate) + wav_writer.setsampwidth(sample_width) + wav_writer.setnchannels(1) + wav_writer.writeframes(raw_data) + wav_data = wav_file.getvalue() + finally: # make sure resources are cleaned up + wav_writer.close() + return wav_data + + def get_aiff_data(self, convert_rate=None, convert_width=None): + """ + Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance. + + If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. + + If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. + + Writing these bytes directly to a file results in a valid `AIFF-C file `__. + """ + raw_data = self.get_raw_data(convert_rate, convert_width) + sample_rate = ( + self.sample_rate if convert_rate is None else convert_rate + ) + sample_width = ( + self.sample_width if convert_width is None else convert_width + ) + + # the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian + if hasattr( + audioop, "byteswap" + ): # ``audioop.byteswap`` was only added in Python 3.4 + raw_data = audioop.byteswap(raw_data, sample_width) + else: # manually reverse the bytes of each sample, which is slower but works well enough as a fallback + raw_data = raw_data[sample_width - 1:: -1] + b"".join( + raw_data[i + sample_width: i: -1] + for i in range(sample_width - 1, len(raw_data), sample_width) + ) + + # generate the AIFF-C file contents + with io.BytesIO() as aiff_file: + aiff_writer = aifc.open(aiff_file, "wb") + try: # note that we can't use context manager, since that was only added in Python 3.4 + aiff_writer.setframerate(sample_rate) + aiff_writer.setsampwidth(sample_width) + aiff_writer.setnchannels(1) + aiff_writer.writeframes(raw_data) + aiff_data = aiff_file.getvalue() + finally: # make sure resources are cleaned up + aiff_writer.close() + return aiff_data + + def get_flac_data(self, convert_rate=None, convert_width=None): + """ + Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance. + + Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC. + + If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match. + + If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match. + + Writing these bytes directly to a file results in a valid `FLAC file `__. 
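A brief sketch of the 32-bit note above, assuming a FLAC encoder is available (one is bundled for common platforms): without an explicit convert_width, 32-bit input is encoded as 24-bit FLAC.

    import speech_recognition as sr

    audio = sr.AudioData(b"\x00\x00\x00\x00" * 4, sample_rate=16000, sample_width=4)
    flac_bytes = audio.get_flac_data()       # sample width is capped at 24-bit before encoding
    assert flac_bytes[:4] == b"fLaC"         # FLAC stream marker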
+ """ + assert convert_width is None or ( + convert_width % 1 == 0 and 1 <= convert_width <= 3 + ), "Sample width to convert to must be between 1 and 3 inclusive" + + if ( + self.sample_width > 3 and convert_width is None + ): # resulting WAV data would be 32-bit, which is not convertable to FLAC using our encoder + convert_width = 3 # the largest supported sample width is 24-bit, so we'll limit the sample width to that + + # run the FLAC converter with the WAV data to get the FLAC data + wav_data = self.get_wav_data(convert_rate, convert_width) + flac_converter = get_flac_converter() + if ( + os.name == "nt" + ): # on Windows, specify that the process is to be started without showing a console window + startup_info = subprocess.STARTUPINFO() + startup_info.dwFlags |= ( + subprocess.STARTF_USESHOWWINDOW + ) # specify that the wShowWindow field of `startup_info` contains a value + startup_info.wShowWindow = ( + subprocess.SW_HIDE + ) # specify that the console window should be hidden + else: + startup_info = None # default startupinfo + process = subprocess.Popen( + [ + flac_converter, + "--stdout", + "--totally-silent", # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output + "--best", # highest level of compression available + "-", # the input FLAC file contents will be given in stdin + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + startupinfo=startup_info, + ) + flac_data, stderr = process.communicate(wav_data) + return flac_data + + +def get_flac_converter(): + """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found.""" + flac_converter = shutil_which("flac") # check for installed version first + if flac_converter is None: # flac utility is not installed + base_path = os.path.dirname( + os.path.abspath(__file__) + ) # directory of the current module file, where all the FLAC bundled binaries are stored + system, machine = platform.system(), platform.machine() + if system == "Windows" and machine in { + "i686", + "i786", + "x86", + "x86_64", + "AMD64", + }: + flac_converter = os.path.join(base_path, "flac-win32.exe") + elif system == "Darwin" and machine in { + "i686", + "i786", + "x86", + "x86_64", + "AMD64", + "arm64", + }: + flac_converter = os.path.join(base_path, "flac-mac") + elif system == "Linux" and machine in {"i686", "i786", "x86"}: + flac_converter = os.path.join(base_path, "flac-linux-x86") + elif system == "Linux" and machine in {"x86_64", "AMD64"}: + flac_converter = os.path.join(base_path, "flac-linux-x86_64") + else: # no FLAC converter available + raise OSError( + "FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent" + ) + + # mark FLAC converter as executable if possible + try: + # handle known issue when running on docker: + # run executable right after chmod() may result in OSError "Text file busy" + # fix: flush FS with sync + if not os.access(flac_converter, os.X_OK): + stat_info = os.stat(flac_converter) + os.chmod(flac_converter, stat_info.st_mode | stat.S_IEXEC) + if "Linux" in platform.system(): + os.sync() if sys.version_info >= (3, 3) else os.system("sync") + + except OSError: + pass + + return flac_converter + + +def shutil_which(pgm): + """Python 2 compatibility: backport of ``shutil.which()`` from Python 3""" + path = os.getenv("PATH") + for p in path.split(os.path.pathsep): + p = os.path.join(p, pgm) + if os.path.exists(p) and os.access(p, os.X_OK): 
+ return p diff --git a/speech_recognition/exceptions.py b/speech_recognition/exceptions.py new file mode 100644 index 00000000..3e208a12 --- /dev/null +++ b/speech_recognition/exceptions.py @@ -0,0 +1,22 @@ +class SetupError(Exception): + pass + + +class WaitTimeoutError(Exception): + pass + + +class RequestError(Exception): + pass + + +class UnknownValueError(Exception): + pass + + +class TranscriptionNotReady(Exception): + pass + + +class TranscriptionFailed(Exception): + pass diff --git a/speech_recognition/flac-linux-i386 b/speech_recognition/flac-linux-i386 deleted file mode 100755 index 9ebc8a48..00000000 Binary files a/speech_recognition/flac-linux-i386 and /dev/null differ diff --git a/speech_recognition/flac-linux-x86 b/speech_recognition/flac-linux-x86 new file mode 100644 index 00000000..6db17556 Binary files /dev/null and b/speech_recognition/flac-linux-x86 differ diff --git a/speech_recognition/flac-linux-x86_64 b/speech_recognition/flac-linux-x86_64 new file mode 100755 index 00000000..285b4e35 Binary files /dev/null and b/speech_recognition/flac-linux-x86_64 differ diff --git a/speech_recognition/flac-mac b/speech_recognition/flac-mac index 5437a99f..be3e975f 100755 Binary files a/speech_recognition/flac-mac and b/speech_recognition/flac-mac differ diff --git a/speech_recognition/flac-win32.exe b/speech_recognition/flac-win32.exe index 5e878aad..a83210df 100755 Binary files a/speech_recognition/flac-win32.exe and b/speech_recognition/flac-win32.exe differ diff --git a/speech_recognition/recognizers/__init__.py b/speech_recognition/recognizers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/speech_recognition/recognizers/google.py b/speech_recognition/recognizers/google.py new file mode 100644 index 00000000..17f0d12c --- /dev/null +++ b/speech_recognition/recognizers/google.py @@ -0,0 +1,262 @@ +from __future__ import annotations + +import json +from typing import Dict, Literal, TypedDict +from urllib.error import HTTPError, URLError +from urllib.parse import urlencode +from urllib.request import Request, urlopen + +from typing_extensions import NotRequired + +from speech_recognition.audio import AudioData +from speech_recognition.exceptions import RequestError, UnknownValueError + + +class Alternative(TypedDict): + transcript: str + confidence: float + + +class Result(TypedDict): + alternative: list[Alternative] + final: bool + + +class GoogleResponse(TypedDict): + result: list[Result] + result_index: NotRequired[int] + + +ProfanityFilterLevel = Literal[0, 1] +RequestHeaders = Dict[str, str] + +ENDPOINT = "http://www.google.com/speech-api/v2/recognize" + + +class RequestBuilder: + def __init__( + self, + *, + endpoint: str, + key: str, + language: str, + filter_level: ProfanityFilterLevel, + ) -> None: + self.endpoint = endpoint + self.key = key + self.language = language + self.filter_level = filter_level + + def build(self, audio_data: AudioData) -> Request: + if not isinstance(audio_data, AudioData): + raise ValueError("``audio_data`` must be audio data") + + url = self.build_url() + headers = self.build_headers(audio_data) + flac_data = self.build_data(audio_data) + request = Request(url, data=flac_data, headers=headers) + return request + + def build_url(self) -> str: + """ + >>> builder = RequestBuilder(endpoint="http://www.google.com/speech-api/v2/recognize", key="awesome-key", language="en-US", filter_level=0) + >>> builder.build_url() + 'http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0' + """ 
+ params = urlencode( + { + "client": "chromium", + "lang": self.language, + "key": self.key, + "pFilter": self.filter_level, + } + ) + return f"{self.endpoint}?{params}" + + def build_headers(self, audio_data: AudioData) -> RequestHeaders: + """ + >>> builder = RequestBuilder(endpoint="", key="", language="", filter_level=1) + >>> audio_data = AudioData(b"", 16_000, 1) + >>> builder.build_headers(audio_data) + {'Content-Type': 'audio/x-flac; rate=16000'} + """ + rate = audio_data.sample_rate + headers = {"Content-Type": f"audio/x-flac; rate={rate}"} + return headers + + def build_data(self, audio_data: AudioData) -> bytes: + flac_data = audio_data.get_flac_data( + convert_rate=self.to_convert_rate(audio_data.sample_rate), + convert_width=2, # audio samples must be 16-bit + ) + return flac_data + + @staticmethod + def to_convert_rate(sample_rate: int) -> int: + """Audio samples must be at least 8 kHz + + >>> RequestBuilder.to_convert_rate(16_000) + >>> RequestBuilder.to_convert_rate(8_000) + >>> RequestBuilder.to_convert_rate(7_999) + 8000 + """ + return None if sample_rate >= 8000 else 8000 + + +def create_request_builder( + *, + endpoint: str, + key: str | None = None, + language: str = "en-US", + filter_level: ProfanityFilterLevel = 0, +) -> RequestBuilder: + if not isinstance(language, str): + raise ValueError("``language`` must be a string") + if key is not None and not isinstance(key, str): + raise ValueError("``key`` must be ``None`` or a string") + + if key is None: + key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw" + return RequestBuilder( + endpoint=endpoint, + key=key, + language=language, + filter_level=filter_level, + ) + + +class OutputParser: + def __init__(self, *, show_all: bool, with_confidence: bool) -> None: + self.show_all = show_all + self.with_confidence = with_confidence + + def parse(self, response_text: str): + actual_result = self.convert_to_result(response_text) + if self.show_all: + return actual_result + + best_hypothesis = self.find_best_hypothesis( + actual_result["alternative"] + ) + # https://cloud.google.com/speech-to-text/docs/basics#confidence-values + # "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results." + confidence = best_hypothesis.get("confidence", 0.5) + if self.with_confidence: + return best_hypothesis["transcript"], confidence + return best_hypothesis["transcript"] + + @staticmethod + def convert_to_result(response_text: str) -> Result: + r""" + >>> response_text = '''{"result":[]} + ... {"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0} + ... ''' + >>> OutputParser.convert_to_result(response_text) + {'alternative': [{'transcript': 'one two three', 'confidence': 0.49585345}, {'transcript': '1 2', 'confidence': 0.42899391}], 'final': True} + + >>> OutputParser.convert_to_result("") + Traceback (most recent call last): + ... + speech_recognition.exceptions.UnknownValueError + >>> OutputParser.convert_to_result('\n{"result":[]}') + Traceback (most recent call last): + ... + speech_recognition.exceptions.UnknownValueError + >>> OutputParser.convert_to_result('{"result":[{"foo": "bar"}]}') + Traceback (most recent call last): + ... + speech_recognition.exceptions.UnknownValueError + >>> OutputParser.convert_to_result('{"result":[{"alternative": []}]}') + Traceback (most recent call last): + ... 
+ speech_recognition.exceptions.UnknownValueError + """ + # ignore any blank blocks + for line in response_text.split("\n"): + if not line: + continue + result: list[Result] = json.loads(line)["result"] + if len(result) != 0: + if len(result[0].get("alternative", [])) == 0: + raise UnknownValueError() + return result[0] + raise UnknownValueError() + + @staticmethod + def find_best_hypothesis(alternatives: list[Alternative]) -> Alternative: + """ + >>> alternatives = [{"transcript": "one two three", "confidence": 0.42899391}, {"transcript": "1 2", "confidence": 0.49585345}] + >>> OutputParser.find_best_hypothesis(alternatives) + {'transcript': 'one two three', 'confidence': 0.42899391} + + >>> alternatives = [{"confidence": 0.49585345}] + >>> OutputParser.find_best_hypothesis(alternatives) + Traceback (most recent call last): + ... + speech_recognition.exceptions.UnknownValueError + """ + if "confidence" in alternatives: + # BUG: actual_result["alternative"] (=alternatives) is list, not dict + # return alternative with highest confidence score + best_hypothesis: Alternative = max( + alternatives, + key=lambda alternative: alternative["confidence"], + ) + else: + # when there is no confidence available, we arbitrarily choose the first hypothesis. + best_hypothesis: Alternative = alternatives[0] + if "transcript" not in best_hypothesis: + raise UnknownValueError() + return best_hypothesis + + +def obtain_transcription(request: Request, timeout: int) -> str: + try: + response = urlopen(request, timeout=timeout) + except HTTPError as e: + raise RequestError("recognition request failed: {}".format(e.reason)) + except URLError as e: + raise RequestError( + "recognition connection failed: {}".format(e.reason) + ) + return response.read().decode("utf-8") + + +def recognize_legacy( + recognizer, + audio_data: AudioData, + key: str | None = None, + language: str = "en-US", + pfilter: ProfanityFilterLevel = 0, + show_all: bool = False, + with_confidence: bool = False, + *, + endpoint: str = ENDPOINT, +): + """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API. + + The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**. + + To obtain your own API key, simply following the steps on the `API Keys `__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API". + + The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer `__. + + The profanity filter level can be adjusted with ``pfilter``: 0 - No filter, 1 - Only shows the first character and replaces the rest with asterisks. The default is level 0. + + Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary. + + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. 
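A minimal usage sketch (the audio file path is a placeholder; network access and the default key described above are assumed):

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("speech.wav") as source:
        audio = r.record(source)
    print(r.recognize_google(audio, language="en-US"))
    # or, to also obtain the confidence estimate:
    text, confidence = r.recognize_google(audio, with_confidence=True)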
+ """ + request_builder = create_request_builder( + endpoint=endpoint, key=key, language=language, filter_level=pfilter + ) + request = request_builder.build(audio_data) + + response_text = obtain_transcription( + request, timeout=recognizer.operation_timeout + ) + + output_parser = OutputParser( + show_all=show_all, with_confidence=with_confidence + ) + return output_parser.parse(response_text) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py new file mode 100644 index 00000000..5c5a7f62 --- /dev/null +++ b/speech_recognition/recognizers/google_cloud.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, TypedDict +from urllib.error import URLError + +from speech_recognition.audio import AudioData +from speech_recognition.exceptions import RequestError, UnknownValueError + +if TYPE_CHECKING: + from google.cloud.speech import ( + RecognitionConfig, + RecognizeResponse, + SpeechContext, + ) + from typing_extensions import Required + + +class GoogleCloudRecognizerParameters(TypedDict, total=False): + """Optional parameters. + + The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). Default: ``"en-US"``. + A list of supported language tags can be found in the `Speech-to-Text supported languages `__. + + If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. + This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. + Note that the API imposes certain `restrictions on the list of phrase strings `__. + + ``show_all``: See :py:func:`recognize`. + + ``model``: You can select the model to get best results. (See `RecognitionConfig's documentation `__ for detail) + + ``use_enhanced``: Set to true to use an enhanced model for speech recognition. + """ + + # SpeechRecognition specific parameters + preferred_phrases: list[str] + show_all: bool + + # Speech-to-Text V1 API's parameters + language_code: str + model: str + use_enhanced: bool + # TODO Add others support + + +class GoogleCloudSpeechV1Parameters(TypedDict, total=False): + """Speech-to-Text V1 API's parameters. 
+ + https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig + """ + + encoding: Required[RecognitionConfig.AudioEncoding] + sample_rate_hertz: Required[int] + language_code: Required[str] + speech_contexts: list[SpeechContext] + enable_word_time_offsets: bool + model: str + use_enhanced: bool + + +def _build_config( + audio_data: AudioData, recognizer_params: GoogleCloudRecognizerParameters +) -> RecognitionConfig: + from google.cloud import speech + + parameters: GoogleCloudSpeechV1Parameters = { + "encoding": speech.RecognitionConfig.AudioEncoding.FLAC, + "sample_rate_hertz": audio_data.sample_rate, + "language_code": recognizer_params.pop("language_code", "en-US"), + } + if preferred_phrases := recognizer_params.pop("preferred_phrases", None): + parameters["speech_contexts"] = [ + speech.SpeechContext(phrases=preferred_phrases) + ] + if recognizer_params.pop("show_all", False): + # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets + parameters["enable_word_time_offsets"] = True + return speech.RecognitionConfig(**(parameters | recognizer_params)) + + +def recognize( + recognizer, + audio_data: AudioData, + credentials_json_path: str | None = None, + **kwargs: GoogleCloudRecognizerParameters, +) -> str | RecognizeResponse: + """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API. + + This function requires a Google Cloud Platform account; see the `Set up Speech-to-Text `__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project. + And create local authentication credentials for your user account. The result is a JSON file containing the API credentials. You can specify the JSON file by ``credentials_json_path``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. + + Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. + For other parameters, see :py:class:`GoogleCloudRecognizerParameters`. + + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. + """ + try: + from google.api_core.exceptions import GoogleAPICallError + from google.cloud import speech + except ImportError: + raise RequestError( + "missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly." 
+ ) + + client = ( + speech.SpeechClient.from_service_account_json(credentials_json_path) + if credentials_json_path + else speech.SpeechClient() + ) + + flac_data = audio_data.get_flac_data( + # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range + convert_rate=( + None + if 8000 <= audio_data.sample_rate <= 48000 + else max(8000, min(audio_data.sample_rate, 48000)) + ), + convert_width=2, # audio samples must be 16-bit + ) + audio = speech.RecognitionAudio(content=flac_data) + + config = _build_config(audio_data, kwargs.copy()) + + try: + response = client.recognize(config=config, audio=audio) + except GoogleAPICallError as e: + raise RequestError(e) + except URLError as e: + raise RequestError( + "recognition connection failed: {0}".format(e.reason) + ) + + if kwargs.get("show_all"): + return response + if len(response.results) == 0: + raise UnknownValueError() + + transcript = " ".join( + result.alternatives[0].transcript.strip() + for result in response.results + ) + return transcript diff --git a/speech_recognition/recognizers/pocketsphinx.py b/speech_recognition/recognizers/pocketsphinx.py new file mode 100644 index 00000000..6092cba2 --- /dev/null +++ b/speech_recognition/recognizers/pocketsphinx.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +import os +from collections.abc import Sequence + +from speech_recognition import PortableNamedTemporaryFile +from speech_recognition.audio import AudioData +from speech_recognition.exceptions import RequestError, UnknownValueError + +AcousticParametersDirectoryPath = str +LanguageModelFilePath = str +PhonemeDictionaryFilePath = str +SphinxDataFilePaths = tuple[AcousticParametersDirectoryPath, LanguageModelFilePath, PhonemeDictionaryFilePath] + +Keyword = str +Sensitivity = float +KeywordEntry = tuple[Keyword, Sensitivity] + + +def recognize( + recognizer, + audio_data: AudioData, + language: str | SphinxDataFilePaths = "en-US", + keyword_entries: Sequence[KeywordEntry] | None = None, + grammar: str | None = None, + show_all: bool = False, +): + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx. + + The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using `PocketSphinx `__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models. + + If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase, and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for. + + Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. 
Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution in the next run. If ``keyword_entries`` are passed, the content of ``grammar`` will be ignored. + + Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition. + + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation. + """ + # TODO Move this validation into KeywordEntry initialization + assert keyword_entries is None or all(isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1" + + try: + from pocketsphinx import FsgModel, Jsgf, pocketsphinx + except ImportError: + raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.") + + if isinstance(language, str): # directory containing language data + language_directory = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "pocketsphinx-data", language) + if not os.path.isdir(language_directory): + raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory)) + acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model") + language_model_file = os.path.join(language_directory, "language-model.lm.bin") + phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict") + else: # 3-tuple of Sphinx data file paths + acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language + if not os.path.isdir(acoustic_parameters_directory): + raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory)) + if not os.path.isfile(language_model_file): + raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file)) + if not os.path.isfile(phoneme_dictionary_file): + raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file)) + + # create decoder object + config = pocketsphinx.Config() + config.set_string("-hmm", acoustic_parameters_directory) # set the path of the hidden Markov model (HMM) parameter files + config.set_string("-lm", language_model_file) + config.set_string("-dict", phoneme_dictionary_file) + config.set_string("-logfn", os.devnull) # disable logging (logging causes unwanted output in terminal) + decoder = pocketsphinx.Decoder(config) + + # obtain audio data + raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2) # the included language models require audio to be 16-bit mono 16 kHz in little-endian format + + # obtain recognition results + if keyword_entries is not None: # explicitly specified set of keywords + with PortableNamedTemporaryFile("w") as f: + # generate a keywords file - Sphinx documentation recommends sensitivities between 1e-50 and 1e-5 + f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries) + f.flush() + + # perform the speech recognition with the keywords file (this is inside the context manager so the file isn't deleted until we're done) + decoder.add_kws("keywords", f.name) + 
decoder.activate_search("keywords") + elif grammar is not None: # a path to a FSG or JSGF grammar + if not os.path.exists(grammar): + raise ValueError("Grammar '{0}' does not exist.".format(grammar)) + grammar_path = os.path.abspath(os.path.dirname(grammar)) + grammar_name = os.path.splitext(os.path.basename(grammar))[0] + fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name) + if not os.path.exists(fsg_path): # create FSG grammar if not available + jsgf = Jsgf(grammar) + rule = jsgf.get_rule("{0}.{0}".format(grammar_name)) + fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5) + fsg.writefile(fsg_path) + else: + fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5) + decoder.set_fsg(grammar_name, fsg) + decoder.set_search(grammar_name) + + decoder.start_utt() # begin utterance processing + decoder.process_raw(raw_data, False, True) # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True) + decoder.end_utt() # stop utterance processing + + if show_all: return decoder + + # return results + hypothesis = decoder.hyp() + if hypothesis is not None: return hypothesis.hypstr + raise UnknownValueError() # no transcriptions available diff --git a/speech_recognition/recognizers/whisper_api/__init__.py b/speech_recognition/recognizers/whisper_api/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/speech_recognition/recognizers/whisper_api/base.py b/speech_recognition/recognizers/whisper_api/base.py new file mode 100644 index 00000000..c435ef59 --- /dev/null +++ b/speech_recognition/recognizers/whisper_api/base.py @@ -0,0 +1,22 @@ +from io import BytesIO + +from speech_recognition.audio import AudioData + + +class OpenAICompatibleRecognizer: + def __init__(self, client) -> None: + self.client = client + + def recognize(self, audio_data: "AudioData", model: str, **kwargs) -> str: + if not isinstance(audio_data, AudioData): + raise ValueError( + "``audio_data`` must be an ``AudioData`` instance" + ) + + wav_data = BytesIO(audio_data.get_wav_data()) + wav_data.name = "SpeechRecognition_audio.wav" + + transcript = self.client.audio.transcriptions.create( + file=wav_data, model=model, **kwargs + ) + return transcript.text diff --git a/speech_recognition/recognizers/whisper_api/groq.py b/speech_recognition/recognizers/whisper_api/groq.py new file mode 100644 index 00000000..9beccf6b --- /dev/null +++ b/speech_recognition/recognizers/whisper_api/groq.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from typing import Literal, TypedDict + +from typing_extensions import Unpack + +from speech_recognition.audio import AudioData +from speech_recognition.exceptions import SetupError +from speech_recognition.recognizers.whisper_api.base import ( + OpenAICompatibleRecognizer, +) + +# https://console.groq.com/docs/speech-text#supported-models +GroqModel = Literal[ + "whisper-large-v3-turbo", "whisper-large-v3", "distil-whisper-large-v3-en" +] + + +class GroqOptionalParameters(TypedDict): + """Groq speech transcription's optional parameters. + + https://console.groq.com/docs/speech-text#transcription-endpoint-usage + """ + + prompt: str + response_format: str + temperature: float + language: str + + +def recognize( + recognizer, + audio_data: "AudioData", + *, + model: GroqModel = "whisper-large-v3-turbo", + **kwargs: Unpack[GroqOptionalParameters], +) -> str: + """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Groq Whisper API. 
+ + This function requires a Groq account; visit https://console.groq.com/login, then generate an API key in the `API Keys `__ menu. + + Details: https://console.groq.com/docs/speech-text + + Set the environment variable ``GROQ_API_KEY``; otherwise, the groq library will raise a ``groq.GroqError``. + """ + try: + import groq + except ImportError: + raise SetupError( + "missing groq module: ensure that groq is set up correctly." + ) + + groq_recognizer = OpenAICompatibleRecognizer(groq.Groq()) + return groq_recognizer.recognize(audio_data, model, **kwargs) diff --git a/speech_recognition/recognizers/whisper_api/openai.py b/speech_recognition/recognizers/whisper_api/openai.py new file mode 100644 index 00000000..0208a5bf --- /dev/null +++ b/speech_recognition/recognizers/whisper_api/openai.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from typing import Literal + +from typing_extensions import Unpack + +from speech_recognition.audio import AudioData +from speech_recognition.exceptions import SetupError +from speech_recognition.recognizers.whisper_api.base import ( + OpenAICompatibleRecognizer, +) + +# https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-model +WhisperModel = Literal["whisper-1"] + + +class OpenAIOptionalParameters: + """OpenAI speech transcription's optional parameters. + + https://platform.openai.com/docs/api-reference/audio/createTranscription + """ + + language: str + prompt: str + # TODO Add support `Literal["text", "srt", "verbose_json", "vtt"]` + response_format: Literal["json"] + temperature: float + # timestamp_granularities # TODO support + + +def recognize( + recognizer, + audio_data: "AudioData", + *, + model: WhisperModel = "whisper-1", + **kwargs: Unpack[OpenAIOptionalParameters], +) -> str: + """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API. + + This function requires an OpenAI account; visit https://platform.openai.com/signup, then generate an API key in `User settings `__. + + Details: https://platform.openai.com/docs/guides/speech-to-text + + Set the environment variable ``OPENAI_API_KEY``; otherwise, the openai library will raise an ``openai.OpenAIError``. + """ + try: + import openai + except ImportError: + raise SetupError( + "missing openai module: ensure that openai is set up correctly." 
+ ) + + openai_recognizer = OpenAICompatibleRecognizer(openai.OpenAI()) + return openai_recognizer.recognize(audio_data, model, **kwargs) diff --git a/speech_recognition/recognizers/whisper_local/__init__.py b/speech_recognition/recognizers/whisper_local/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/speech_recognition/recognizers/whisper_local/base.py b/speech_recognition/recognizers/whisper_local/base.py new file mode 100644 index 00000000..ad6ee101 --- /dev/null +++ b/speech_recognition/recognizers/whisper_local/base.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import io +from typing import TYPE_CHECKING, Any, Protocol + +from speech_recognition.audio import AudioData + +if TYPE_CHECKING: + import numpy as np + + +class Transcribable(Protocol): + def transcribe( + self, audio_array: np.ndarray, **kwargs + ) -> str | dict[str, Any]: + pass + + +class WhisperCompatibleRecognizer: + def __init__(self, model: Transcribable) -> None: + self.model = model + + def recognize( + self, audio_data: AudioData, show_dict: bool = False, **kwargs + ): + if not isinstance(audio_data, AudioData): + raise ValueError( + "``audio_data`` must be an ``AudioData`` instance" + ) + + import numpy as np + import soundfile as sf + + # 16 kHz https://github.com/openai/whisper/blob/28769fcfe50755a817ab922a7bc83483159600a9/whisper/audio.py#L98-L99 + wav_bytes = audio_data.get_wav_data(convert_rate=16000) + wav_stream = io.BytesIO(wav_bytes) + audio_array, sampling_rate = sf.read(wav_stream) + audio_array = audio_array.astype(np.float32) + + result = self.model.transcribe(audio_array, **kwargs) + + if show_dict: + return result + else: + return result["text"] diff --git a/speech_recognition/recognizers/whisper_local/faster_whisper.py b/speech_recognition/recognizers/whisper_local/faster_whisper.py new file mode 100644 index 00000000..e7ce10fb --- /dev/null +++ b/speech_recognition/recognizers/whisper_local/faster_whisper.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict + +from speech_recognition.audio import AudioData +from speech_recognition.recognizers.whisper_local.base import ( + WhisperCompatibleRecognizer, +) + +if TYPE_CHECKING: + import numpy as np + from faster_whisper import WhisperModel + from faster_whisper.transcribe import Segment + from typing_extensions import Unpack + + +class TranscribeOutput(TypedDict): + text: str + segments: list[Segment] + language: str + + +class TranscribableAdapter: + def __init__(self, model: WhisperModel) -> None: + self.model = model + + def transcribe( + self, audio_array: np.ndarray, **kwargs + ) -> TranscribeOutput: + segments_generator, info = self.model.transcribe(audio_array, **kwargs) + segments = list(segments_generator) + return { + "text": " ".join(segment.text for segment in segments), + "segments": segments, + "language": info.language, + } + + +class InitOptionalParameters(TypedDict, total=False): + # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L575 + device: Literal["cpu", "gpu", "auto"] + compute_type: str + download_root: str + # TODO Add others + + +class TranscribeOptionalParameters(TypedDict, total=False): + # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L692 + language: str + task: Literal["transcribe", "translate"] + beam_size: int + # TODO Add others + + +def recognize( + recognizer, + audio_data: AudioData, + model: str = "base", + show_dict: bool = False, + init_options: 
InitOptionalParameters | None = None, + **transcribe_options: Unpack[TranscribeOptionalParameters], +) -> str | TranscribeOutput: + """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper. + + Pick the ``model`` size (same as Whisper). + + If ``show_dict`` is true, returns the detailed response from Whisper, including the detected language. Otherwise, returns only the transcription. + + You can specify: + + * ``language``: recognition language, a lowercase two-letter language code like "en" or "fr". + + * If not set, Faster Whisper will automatically detect the language. + + * ``task`` + + * If you want to transcribe and **translate** to English, set ``task="translate"``. + + Other values are passed directly to whisper. See https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for all options. + """ + from faster_whisper import WhisperModel + + model = WhisperModel(model, **init_options or {}) + whisper_recognizer = WhisperCompatibleRecognizer( + TranscribableAdapter(model) + ) + return whisper_recognizer.recognize( + audio_data, show_dict=show_dict, **transcribe_options + ) + + +if __name__ == "__main__": + import argparse + + import speech_recognition as sr + + parser = argparse.ArgumentParser() + parser.add_argument("audio_file") + args = parser.parse_args() + + r = sr.Recognizer() + with sr.AudioFile(args.audio_file) as source: + audio_data = r.listen(source) + + transcription = recognize(None, audio_data) + print(transcription) diff --git a/speech_recognition/recognizers/whisper_local/whisper.py b/speech_recognition/recognizers/whisper_local/whisper.py new file mode 100644 index 00000000..622ee071 --- /dev/null +++ b/speech_recognition/recognizers/whisper_local/whisper.py @@ -0,0 +1,108 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict + +from speech_recognition.audio import AudioData +from speech_recognition.recognizers.whisper_local.base import ( + WhisperCompatibleRecognizer, +) + +if TYPE_CHECKING: + import numpy as np + import torch + from typing_extensions import Unpack + from whisper import Whisper + + +class LoadModelOptionalParameters(TypedDict, total=False): + # ref: https://github.com/openai/whisper/blob/v20240930/whisper/__init__.py#L103 + device: str | torch.device + download_root: str + in_memory: bool + + +class TranscribeOptionalParameters(TypedDict, total=False): + """Transcribe optional parameters & DecodingOptions parameters.""" + + # ref: https://github.com/openai/whisper/blob/v20240930/whisper/transcribe.py#L38 + temperature: float | tuple[float, ...] 
+ # TODO Add others + + # ref: https://github.com/openai/whisper/blob/v20240930/whisper/decoding.py#L81 + task: Literal["transcribe", "translate"] + language: str + fp16: bool + # TODO Add others + + +class Segment(TypedDict): + id: int + seek: int + start: float + end: float + text: str + tokens: list[int] + temperature: float + avg_logprob: float + compression_ratio: float + no_speech_prob: float + + +class TranscribeOutput(TypedDict): + text: str + segments: list[Segment] + language: str + + +class TranscribableAdapter: + def __init__(self, model: Whisper) -> None: + self.model = model + + def transcribe( + self, audio_array: np.ndarray, **kwargs + ) -> TranscribeOutput: + if "fp16" not in kwargs: + import torch + + kwargs["fp16"] = torch.cuda.is_available() + + return self.model.transcribe(audio_array, **kwargs) + + +def recognize( + recognizer, + audio_data: AudioData, + model: str = "base", + show_dict: bool = False, + load_options: LoadModelOptionalParameters | None = None, + **transcribe_options: Unpack[TranscribeOptionalParameters], +) -> str | TranscribeOutput: + """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper. + + Pick ``model`` from output of :command:`python -c 'import whisper; print(whisper.available_models())'`. + See also https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages. + + If ``show_dict`` is true, returns the full dict response from Whisper, including the detected language. Otherwise returns only the transcription. + + You can specify: + + * ``language``: recognition language, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py + + * If not set, Whisper will automatically detect the language. + + * ``task`` + + * If you want transcribe + **translate** to english, set ``task="translate"``. + + Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options. 
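+
+    A minimal usage sketch (illustrative only; assumes the package is installed with its ``whisper-local`` extra so that ``whisper`` and ``soundfile`` are importable, and that ``tests/english.wav`` from this repository is available)::
+
+        import speech_recognition as sr
+        from speech_recognition.recognizers.whisper_local.whisper import recognize
+
+        r = sr.Recognizer()
+        with sr.AudioFile("tests/english.wav") as source:
+            audio_data = r.record(source)
+        # prints the transcribed text
+        print(recognize(r, audio_data, model="base", language="english"))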
+ """ + + import whisper + + whisper_model = whisper.load_model(model, **load_options or {}) + whisper_recognizer = WhisperCompatibleRecognizer( + TranscribableAdapter(whisper_model) + ) + return whisper_recognizer.recognize( + audio_data, show_dict=show_dict, **transcribe_options + ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..dac56281 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# placeholder file to make this folder a module - this allows tests in this folder to be discovered by `python -m unittest discover` diff --git a/tests/audio-mono-16-bit-44100Hz.aiff b/tests/audio-mono-16-bit-44100Hz.aiff new file mode 100644 index 00000000..0aa7b00e Binary files /dev/null and b/tests/audio-mono-16-bit-44100Hz.aiff differ diff --git a/tests/audio-mono-16-bit-44100Hz.flac b/tests/audio-mono-16-bit-44100Hz.flac new file mode 100644 index 00000000..9fdad1b8 Binary files /dev/null and b/tests/audio-mono-16-bit-44100Hz.flac differ diff --git a/tests/audio-mono-16-bit-44100Hz.wav b/tests/audio-mono-16-bit-44100Hz.wav new file mode 100644 index 00000000..976faef1 Binary files /dev/null and b/tests/audio-mono-16-bit-44100Hz.wav differ diff --git a/tests/audio-mono-24-bit-44100Hz.flac b/tests/audio-mono-24-bit-44100Hz.flac new file mode 100644 index 00000000..a5919811 Binary files /dev/null and b/tests/audio-mono-24-bit-44100Hz.flac differ diff --git a/tests/audio-mono-24-bit-44100Hz.wav b/tests/audio-mono-24-bit-44100Hz.wav new file mode 100644 index 00000000..4e11008d Binary files /dev/null and b/tests/audio-mono-24-bit-44100Hz.wav differ diff --git a/tests/audio-mono-32-bit-44100Hz.wav b/tests/audio-mono-32-bit-44100Hz.wav new file mode 100644 index 00000000..6e81d8ed Binary files /dev/null and b/tests/audio-mono-32-bit-44100Hz.wav differ diff --git a/tests/audio-mono-8-bit-44100Hz.wav b/tests/audio-mono-8-bit-44100Hz.wav new file mode 100644 index 00000000..2cfe4287 Binary files /dev/null and b/tests/audio-mono-8-bit-44100Hz.wav differ diff --git a/tests/audio-stereo-16-bit-44100Hz.aiff b/tests/audio-stereo-16-bit-44100Hz.aiff new file mode 100644 index 00000000..97dff78c Binary files /dev/null and b/tests/audio-stereo-16-bit-44100Hz.aiff differ diff --git a/tests/audio-stereo-16-bit-44100Hz.flac b/tests/audio-stereo-16-bit-44100Hz.flac new file mode 100644 index 00000000..8621d04a Binary files /dev/null and b/tests/audio-stereo-16-bit-44100Hz.flac differ diff --git a/tests/audio-stereo-16-bit-44100Hz.wav b/tests/audio-stereo-16-bit-44100Hz.wav new file mode 100644 index 00000000..58fd5078 Binary files /dev/null and b/tests/audio-stereo-16-bit-44100Hz.wav differ diff --git a/tests/audio-stereo-24-bit-44100Hz.flac b/tests/audio-stereo-24-bit-44100Hz.flac new file mode 100644 index 00000000..c4990bee Binary files /dev/null and b/tests/audio-stereo-24-bit-44100Hz.flac differ diff --git a/tests/audio-stereo-24-bit-44100Hz.wav b/tests/audio-stereo-24-bit-44100Hz.wav new file mode 100644 index 00000000..efc108e3 Binary files /dev/null and b/tests/audio-stereo-24-bit-44100Hz.wav differ diff --git a/tests/audio-stereo-32-bit-44100Hz.wav b/tests/audio-stereo-32-bit-44100Hz.wav new file mode 100644 index 00000000..dd3548bc Binary files /dev/null and b/tests/audio-stereo-32-bit-44100Hz.wav differ diff --git a/tests/audio-stereo-8-bit-44100Hz.wav b/tests/audio-stereo-8-bit-44100Hz.wav new file mode 100644 index 00000000..0a24d90e Binary files /dev/null and b/tests/audio-stereo-8-bit-44100Hz.wav differ diff --git a/tests/chinese.flac b/tests/chinese.flac new file 
mode 100644 index 00000000..f74764fd Binary files /dev/null and b/tests/chinese.flac differ diff --git a/tests/english.wav b/tests/english.wav new file mode 100644 index 00000000..40d7eb5c Binary files /dev/null and b/tests/english.wav differ diff --git a/tests/french.aiff b/tests/french.aiff new file mode 100644 index 00000000..31cd0d0f Binary files /dev/null and b/tests/french.aiff differ diff --git a/tests/recognizers/__init__.py b/tests/recognizers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/recognizers/test_google.py b/tests/recognizers/test_google.py new file mode 100644 index 00000000..4ee0ad1a --- /dev/null +++ b/tests/recognizers/test_google.py @@ -0,0 +1,187 @@ +from unittest import TestCase +from unittest.mock import MagicMock, patch +from urllib.request import Request + +from speech_recognition import Recognizer +from speech_recognition.audio import AudioData +from speech_recognition.recognizers import google + +MODULE_UNDER_TEST = "speech_recognition.recognizers.google" + + +class RequestBuilderTestCase(TestCase): + CLASS_UNDER_TEST = f"{MODULE_UNDER_TEST}.RequestBuilder" + + @patch(f"{MODULE_UNDER_TEST}.Request") + @patch(f"{CLASS_UNDER_TEST}.build_data") + @patch(f"{CLASS_UNDER_TEST}.build_headers") + @patch(f"{CLASS_UNDER_TEST}.build_url") + def test_build(self, build_url, build_headers, build_data, Request): + audio_data = MagicMock(spec=AudioData) + sut = google.RequestBuilder( + endpoint="", key="", language="", filter_level=0 + ) + + actual = sut.build(audio_data) + + self.assertEqual(actual, Request.return_value) + build_url.assert_called_once_with() + build_headers.assert_called_once_with(audio_data) + build_data.assert_called_once_with(audio_data) + Request.assert_called_once_with( + build_url.return_value, + data=build_data.return_value, + headers=build_headers.return_value, + ) + + @patch(f"{CLASS_UNDER_TEST}.to_convert_rate") + def test_build_data(self, to_convert_rate): + # mock has AudioData's attributes (e.g. 
sample_rate) + audio_data = MagicMock(spec=AudioData(None, 1, 1)) + sut = google.RequestBuilder( + endpoint="", key="", language="", filter_level=0 + ) + + actual = sut.build_data(audio_data) + + self.assertEqual(actual, audio_data.get_flac_data.return_value) + audio_data.get_flac_data.assert_called_once_with( + convert_rate=to_convert_rate.return_value, convert_width=2 + ) + to_convert_rate.assert_called_once_with(audio_data.sample_rate) + + +class OutputParserTestCase(TestCase): + CLASS_UNDER_TEST = f"{MODULE_UNDER_TEST}.OutputParser" + + @patch(f"{CLASS_UNDER_TEST}.convert_to_result") + def test_parse_show_all(self, convert_to_result): + parser = google.OutputParser(show_all=True, with_confidence=False) + + actual = parser.parse("dummy response text") + + self.assertEqual(actual, convert_to_result.return_value) + convert_to_result.assert_called_once_with("dummy response text") + + @patch(f"{CLASS_UNDER_TEST}.find_best_hypothesis") + @patch(f"{CLASS_UNDER_TEST}.convert_to_result") + def test_parse_without_confidence( + self, convert_to_result, find_best_hypothesis + ): + convert_to_result.return_value = {"alternative": "dummy"} + find_best_hypothesis.return_value = { + "transcript": "1 2", + "confidence": 0.49585345, + } + + parser = google.OutputParser(show_all=False, with_confidence=False) + actual = parser.parse("dummy response text2") + + self.assertEqual(actual, "1 2") + convert_to_result.assert_called_once_with("dummy response text2") + find_best_hypothesis.assert_called_once_with("dummy") + + @patch(f"{CLASS_UNDER_TEST}.find_best_hypothesis") + @patch(f"{CLASS_UNDER_TEST}.convert_to_result") + def test_parse_with_confidence( + self, convert_to_result, find_best_hypothesis + ): + convert_to_result.return_value = {"alternative": "dummy3"} + find_best_hypothesis.return_value = { + "transcript": "1 2", + "confidence": 0.49585345, + } + + parser = google.OutputParser(show_all=False, with_confidence=True) + actual = parser.parse("dummy response text3") + + self.assertEqual(actual, ("1 2", 0.49585345)) + find_best_hypothesis.assert_called_once_with("dummy3") + + +class ObtainTranscriptionTestCase(TestCase): + @patch(f"{MODULE_UNDER_TEST}.urlopen") + def test_obtain(self, urlopen): + request = MagicMock(spec=Request) + response = urlopen.return_value + + actual = google.obtain_transcription(request, 0) + + self.assertEqual( + actual, response.read.return_value.decode.return_value + ) + urlopen.assert_called_once_with(request, timeout=0) + response.read.assert_called_once_with() + response.read.return_value.decode.assert_called_once_with("utf-8") + + +@patch(f"{MODULE_UNDER_TEST}.OutputParser") +@patch(f"{MODULE_UNDER_TEST}.obtain_transcription") +@patch(f"{MODULE_UNDER_TEST}.create_request_builder") +class RecognizeLegacyTestCase(TestCase): + def test_default_values( + self, create_request_builder, obtain_transcription, OutputParser + ): + request_builder = create_request_builder.return_value + request = request_builder.build.return_value + response_text = obtain_transcription.return_value + output_parser = OutputParser.return_value + + # Add operation_timeout attribute by spec= + recognizer = MagicMock(spec=Recognizer()) + audio_data = MagicMock(spec=AudioData) + + actual = google.recognize_legacy(recognizer, audio_data) + + self.assertEqual(actual, output_parser.parse.return_value) + create_request_builder.assert_called_once_with( + endpoint="http://www.google.com/speech-api/v2/recognize", + key=None, + language="en-US", + filter_level=0, + ) + 
request_builder.build.assert_called_once_with(audio_data) + obtain_transcription.assert_called_once_with( + request, timeout=recognizer.operation_timeout + ) + OutputParser.assert_called_once_with( + show_all=False, with_confidence=False + ) + output_parser.parse.assert_called_once_with(response_text) + + def test_specified_values( + self, create_request_builder, obtain_transcription, OutputParser + ): + request_builder = create_request_builder.return_value + request = request_builder.build.return_value + response_text = obtain_transcription.return_value + output_parser = OutputParser.return_value + recognizer = MagicMock(spec=Recognizer()) + audio_data = MagicMock(spec=AudioData) + + actual = google.recognize_legacy( + recognizer, + audio_data, + key="awesome-key", + language="zh-CN", + pfilter=1, + show_all=True, + with_confidence=False, + endpoint="https://www.google.com/speech-api/v2/recognize", + ) + + self.assertEqual(actual, output_parser.parse.return_value) + create_request_builder.assert_called_once_with( + endpoint="https://www.google.com/speech-api/v2/recognize", + key="awesome-key", + language="zh-CN", + filter_level=1, + ) + request_builder.build.assert_called_once_with(audio_data) + obtain_transcription.assert_called_once_with( + request, timeout=recognizer.operation_timeout + ) + OutputParser.assert_called_once_with( + show_all=True, with_confidence=False + ) + output_parser.parse.assert_called_once_with(response_text) diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py new file mode 100644 index 00000000..cca80aec --- /dev/null +++ b/tests/recognizers/test_google_cloud.py @@ -0,0 +1,182 @@ +from unittest.mock import MagicMock, patch + +from google.cloud.speech import ( + RecognitionAudio, + RecognitionConfig, + RecognizeResponse, + SpeechContext, + SpeechRecognitionAlternative, + SpeechRecognitionResult, + WordInfo, +) + +from speech_recognition import Recognizer +from speech_recognition.audio import AudioData +from speech_recognition.recognizers.google_cloud import recognize + + +@patch("google.cloud.speech.SpeechClient") +def test_transcribe_with_google_cloud_speech(SpeechClient): + client = SpeechClient.return_value + # ref: https://cloud.google.com/speech-to-text/docs/transcribe-gcloud?hl=ja#make_an_audio_transcription_request + client.recognize.return_value = RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="how old is the Brooklyn Bridge", + confidence=0.9840146, + ) + ] + ) + ] + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.sample_rate = 16_000 + audio_data.get_flac_data.return_value = b"flac_data" + + actual = recognize(MagicMock(spec=Recognizer), audio_data) + + assert actual == "how old is the Brooklyn Bridge" + SpeechClient.assert_called_once_with() + client.recognize.assert_called_once_with( + config=RecognitionConfig( + encoding=RecognitionConfig.AudioEncoding.FLAC, + sample_rate_hertz=16_000, + language_code="en-US", + ), + audio=RecognitionAudio(content=b"flac_data"), + ) + + +@patch("google.cloud.speech.SpeechClient") +def test_transcribe_with_specified_credentials(SpeechClient): + client = SpeechClient.from_service_account_json.return_value + client.recognize.return_value = RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="transcript", confidence=0.9 + ) + ] + ) + ] + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.sample_rate = 16_000 + 
audio_data.get_flac_data.return_value = b"flac_data" + + _ = recognize( + MagicMock(spec=Recognizer), + audio_data, + credentials_json_path="path/to/credentials.json", + ) + + SpeechClient.from_service_account_json.assert_called_once_with( + "path/to/credentials.json" + ) + + +@patch("google.cloud.speech.SpeechClient") +def test_transcribe_show_all(SpeechClient): + client = SpeechClient.return_value + client.recognize.return_value = RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="transcript", + confidence=0.9, + words=[ + WordInfo( + word="transcript", + start_time="0s", + end_time="0.400s", + ) + ], + ) + ], + language_code="en-US", + result_end_time="0.400s", + ) + ] + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.sample_rate = 16_000 + audio_data.get_flac_data.return_value = b"flac_data" + + actual = recognize(MagicMock(spec=Recognizer), audio_data, show_all=True) + + assert actual == RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="transcript", + confidence=0.9, + words=[ + WordInfo( + word="transcript", + start_time="0s", + end_time="0.400s", + ) + ], + ) + ], + language_code="en-US", + result_end_time="0.400s", + ) + ] + ) + client.recognize.assert_called_once_with( + config=RecognitionConfig( + encoding=RecognitionConfig.AudioEncoding.FLAC, + sample_rate_hertz=16_000, + language_code="en-US", + enable_word_time_offsets=True, + ), + audio=RecognitionAudio(content=b"flac_data"), + ) + + +@patch("google.cloud.speech.SpeechClient") +def test_transcribe_with_specified_api_parameters(SpeechClient): + client = SpeechClient.return_value + client.recognize.return_value = RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="こんにちは", confidence=0.99 + ) + ] + ) + ] + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.sample_rate = 16_000 + audio_data.get_flac_data.return_value = b"flac_data" + + _ = recognize( + MagicMock(spec=Recognizer), + audio_data, + language_code="ja-JP", + preferred_phrases=["numero", "hoge"], + use_enhanced=True, + ) + + client.recognize.assert_called_once_with( + config=RecognitionConfig( + encoding=RecognitionConfig.AudioEncoding.FLAC, + sample_rate_hertz=16_000, + language_code="ja-JP", + speech_contexts=[SpeechContext(phrases=["numero", "hoge"])], + use_enhanced=True, + ), + audio=RecognitionAudio(content=b"flac_data"), + ) diff --git a/tests/recognizers/whisper_api/__init__.py b/tests/recognizers/whisper_api/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/recognizers/whisper_api/test_groq.py b/tests/recognizers/whisper_api/test_groq.py new file mode 100644 index 00000000..12df2248 --- /dev/null +++ b/tests/recognizers/whisper_api/test_groq.py @@ -0,0 +1,35 @@ +from unittest.mock import MagicMock + +import httpx +import respx + +from speech_recognition import AudioData, Recognizer +from speech_recognition.recognizers.whisper_api import groq + + +@respx.mock(assert_all_called=True, assert_all_mocked=True) +def test_transcribe_with_groq_whisper(respx_mock, monkeypatch): + monkeypatch.setenv("GROQ_API_KEY", "gsk_grok_api_key") + + respx_mock.post( + "https://api.groq.com/openai/v1/audio/transcriptions", + headers__contains={"Authorization": "Bearer gsk_grok_api_key"}, + data__contains={"model": "whisper-large-v3"}, + ).mock( + return_value=httpx.Response( + 200, + json={ + "text": "Transcription by Groq Whisper", 
+ "x_groq": {"id": "req_unique_id"}, + }, + ) + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"audio_data" + + actual = groq.recognize( + MagicMock(spec=Recognizer), audio_data, model="whisper-large-v3" + ) + + assert actual == "Transcription by Groq Whisper" diff --git a/tests/recognizers/whisper_api/test_openai.py b/tests/recognizers/whisper_api/test_openai.py new file mode 100644 index 00000000..2382ec5d --- /dev/null +++ b/tests/recognizers/whisper_api/test_openai.py @@ -0,0 +1,80 @@ +from unittest.mock import MagicMock + +import httpx +import pytest +import respx + +from speech_recognition import AudioData, Recognizer +from speech_recognition.recognizers.whisper_api import openai + + +@pytest.fixture +def setenv_openai_api_key(monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "sk_openai_api_key") + + +@respx.mock(assert_all_called=True, assert_all_mocked=True) +def test_transcribe_with_openai_whisper(respx_mock, setenv_openai_api_key): + respx_mock.post( + "https://api.openai.com/v1/audio/transcriptions", + headers__contains={"Authorization": "Bearer sk_openai_api_key"}, + data__contains={"model": "whisper-1"}, + ).mock( + return_value=httpx.Response( + 200, + json={"text": "Transcription by OpenAI Whisper"}, + ) + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"audio_data" + + actual = openai.recognize(MagicMock(spec=Recognizer), audio_data) + + assert actual == "Transcription by OpenAI Whisper" + audio_data.get_wav_data.assert_called_once() + + +@respx.mock(assert_all_called=True, assert_all_mocked=True) +def test_transcribe_with_specified_language(respx_mock, setenv_openai_api_key): + # https://github.com/Uberi/speech_recognition/issues/681 + respx_mock.post( + "https://api.openai.com/v1/audio/transcriptions", + data__contains={"language": "en"}, + ).respond( + 200, + json={"text": "English transcription"}, + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"english_audio" + + actual = openai.recognize( + MagicMock(spec=Recognizer), audio_data, language="en" + ) + + assert actual == "English transcription" + + +@respx.mock(assert_all_called=True, assert_all_mocked=True) +def test_transcribe_with_specified_prompt(respx_mock, setenv_openai_api_key): + # https://github.com/Uberi/speech_recognition/pull/676 + respx_mock.post( + "https://api.openai.com/v1/audio/transcriptions", + # ref: https://cookbook.openai.com/examples/whisper_prompting_guide + data__contains={"prompt": "Glossary: Aimee, Shawn, BBQ"}, + ).respond( + 200, + json={"text": "Prompted transcription"}, + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"audio_data" + + actual = openai.recognize( + MagicMock(spec=Recognizer), + audio_data, + prompt="Glossary: Aimee, Shawn, BBQ", + ) + + assert actual == "Prompted transcription" diff --git a/tests/recognizers/whisper_local/__init__.py b/tests/recognizers/whisper_local/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/recognizers/whisper_local/test_faster_whisper.py b/tests/recognizers/whisper_local/test_faster_whisper.py new file mode 100644 index 00000000..e2eb7632 --- /dev/null +++ b/tests/recognizers/whisper_local/test_faster_whisper.py @@ -0,0 +1,233 @@ +from __future__ import annotations + +import sys +from collections.abc import Generator +from typing import TYPE_CHECKING +from unittest.mock import ANY, MagicMock, patch + +import numpy as np +import pytest + +from speech_recognition 
import Recognizer +from speech_recognition.audio import AudioData +from speech_recognition.recognizers.whisper_local.faster_whisper import ( + recognize, +) + +if TYPE_CHECKING: + from faster_whisper.transcribe import Segment, TranscriptionInfo + + +@pytest.fixture +def audio_data() -> AudioData: + audio = MagicMock(spec=AudioData) + audio.get_wav_data.return_value = b"" + return audio + + +@pytest.fixture +def segment() -> Segment: + from faster_whisper.transcribe import Segment + + mocked_segment = MagicMock(spec=Segment(*[None] * 11)) + mocked_segment.text = "" + return mocked_segment + + +@pytest.fixture +def transcription_info() -> TranscriptionInfo: + from faster_whisper.transcribe import TranscriptionInfo + + return MagicMock(spec=TranscriptionInfo(*[None] * 7)) + + +@pytest.fixture +def soundfile_read() -> Generator[tuple[MagicMock, np.ndarray], None, None]: + audio_array = MagicMock(spec=np.ndarray) + dummy_sampling_rate = 99_999 + + with patch("soundfile.read") as mock_read: + mock_read.return_value = (audio_array, dummy_sampling_rate) + yield mock_read, audio_array + + +@pytest.mark.skipif( + sys.version_info >= (3, 13), reason="skip on Python 3.13 or later" +) +@patch("faster_whisper.WhisperModel") +class TestTranscribe: + def test_default_parameters( + self, WhisperModel, audio_data, soundfile_read + ): + from faster_whisper.transcribe import ( + Segment, + TranscriptionInfo, + TranscriptionOptions, + VadOptions, + ) + + sf_read, audio_array = soundfile_read + + def segments(): + yield Segment( + id=1, + seek=0, + start=0.0, + end=2.64, + text=" 1, 2, 3", + tokens=[50364, 502, 11, 568, 11, 805, 50496], + avg_logprob=-0.5378808751702309, + compression_ratio=0.4666666666666667, + no_speech_prob=0.17316274344921112, + words=None, + temperature=0.0, + ) + + info = TranscriptionInfo( + language="en", + language_probability=0.9314374923706055, + duration=2.7449375, + duration_after_vad=2.7449375, + all_language_probs=[("en", 0.9314374923706055)], # Omitted + transcription_options=MagicMock(spec=TranscriptionOptions), + vad_options=MagicMock(spec=VadOptions), + ) + + whisper_model = WhisperModel.return_value + whisper_model.transcribe.return_value = segments(), info + + audio_data.get_wav_data.return_value = b"audio data" + actual = recognize(MagicMock(spec=Recognizer), audio_data) + + assert actual == " 1, 2, 3" + WhisperModel.assert_called_once_with("base") + audio_data.get_wav_data.assert_called_once_with(convert_rate=16_000) + sf_read.assert_called_once_with(ANY) + assert sf_read.call_args[0][0].read() == b"audio data" + audio_array.astype.assert_called_once_with(np.float32) + whisper_model.transcribe.assert_called_once_with( + audio_array.astype.return_value + ) + + def test_show_dict(self, WhisperModel, audio_data, soundfile_read): + from faster_whisper.transcribe import ( + Segment, + TranscriptionInfo, + TranscriptionOptions, + VadOptions, + ) + + def segments(): + yield Segment( + id=1, + seek=0, + start=0.0, + end=2.64, + text=" 1, 2, 3", + tokens=[50364, 502, 11, 568, 11, 805, 50496], + avg_logprob=-0.5378808751702309, + compression_ratio=0.4666666666666667, + no_speech_prob=0.17316274344921112, + words=None, + temperature=0.0, + ) + + info = TranscriptionInfo( + language="en", + language_probability=0.9314374923706055, + duration=2.7449375, + duration_after_vad=2.7449375, + all_language_probs=[("en", 0.9314374923706055)], # Omitted + transcription_options=MagicMock(spec=TranscriptionOptions), + vad_options=MagicMock(spec=VadOptions), + ) + + whisper_model = 
WhisperModel.return_value + whisper_model.transcribe.return_value = segments(), info + + actual = recognize( + MagicMock(spec=Recognizer), audio_data, show_dict=True + ) + + expected = { + "text": " 1, 2, 3", + "language": "en", + "segments": [ + Segment( + id=1, + seek=0, + start=0.0, + end=2.64, + text=" 1, 2, 3", + tokens=[50364, 502, 11, 568, 11, 805, 50496], + avg_logprob=-0.5378808751702309, + compression_ratio=0.4666666666666667, + no_speech_prob=0.17316274344921112, + words=None, + temperature=0.0, + ) + ], + } + assert actual == expected + + def test_pass_parameters( + self, + WhisperModel, + audio_data, + segment, + transcription_info, + soundfile_read, + ): + _, audio_array = soundfile_read + + def segments_generator(): + yield segment + + whisper_model = WhisperModel.return_value + whisper_model.transcribe.return_value = ( + segments_generator(), + transcription_info, + ) + + _ = recognize( + MagicMock(spec=Recognizer), + audio_data, + model="small", + show_dict=True, + language="fr", + task="translate", + beam_size=5, + ) + + WhisperModel.assert_called_once_with("small") + whisper_model.transcribe.assert_called_once_with( + audio_array.astype.return_value, + language="fr", + task="translate", + beam_size=5, + ) + + def test_init_parameters( + self, + WhisperModel, + audio_data, + segment, + transcription_info, + soundfile_read, + ): + def segments_generator(): + yield segment + + whisper_model = WhisperModel.return_value + whisper_model.transcribe.return_value = ( + segments_generator(), + transcription_info, + ) + + _ = recognize( + MagicMock(spec=Recognizer), + audio_data, + init_options={"compute_type": "int8"}, + ) + + WhisperModel.assert_called_once_with("base", compute_type="int8") diff --git a/tests/recognizers/whisper_local/test_whisper.py b/tests/recognizers/whisper_local/test_whisper.py new file mode 100644 index 00000000..5fbb6f4c --- /dev/null +++ b/tests/recognizers/whisper_local/test_whisper.py @@ -0,0 +1,117 @@ +import sys +from unittest import TestCase, skipIf +from unittest.mock import ANY, MagicMock, patch + +import numpy as np + +from speech_recognition import AudioData, Recognizer +from speech_recognition.recognizers.whisper_local.whisper import recognize + + +@skipIf(sys.version_info >= (3, 13), "skip on Python 3.13") +@patch("soundfile.read") +@patch("torch.cuda.is_available") +@patch("whisper.load_model") +class RecognizeWhisperTestCase(TestCase): + def test_default_parameters(self, load_model, is_available, sf_read): + whisper_model = load_model.return_value + whisper_model.transcribe.return_value = { + "text": "Transcription by Whisper model", + "language": "en", + # Omit "segments" + } + audio_array = MagicMock(spec=np.ndarray) + dummy_sampling_rate = 99_999 + sf_read.return_value = (audio_array, dummy_sampling_rate) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"wav_data" + actual = recognize(MagicMock(spec=Recognizer), audio_data) + + assert actual == "Transcription by Whisper model" + load_model.assert_called_once_with("base") + audio_data.get_wav_data.assert_called_once_with(convert_rate=16000) + sf_read.assert_called_once_with(ANY) + assert sf_read.call_args[0][0].read() == b"wav_data" + audio_array.astype.assert_called_once_with(np.float32) + whisper_model.transcribe.assert_called_once_with( + audio_array.astype.return_value, + fp16=is_available.return_value, + ) + + def test_return_as_dict(self, load_model, is_available, sf_read): + whisper_model = load_model.return_value + 
whisper_model.transcribe.return_value = { + "text": " 1, 2, 3", + "segments": [ + { + "id": 0, + "seek": 0, + "start": 0.0, + "end": 2.64, + "text": " 1, 2, 3", + "tokens": [50364, 502, 11, 568, 11, 805, 50496], + "temperature": 0.0, + "avg_logprob": -0.5379014015197754, + "compression_ratio": 0.4666666666666667, + "no_speech_prob": 0.17316073179244995, + } + ], + "language": "en", + } + audio_array = MagicMock(spec=np.ndarray) + dummy_sampling_rate = 99_999 + sf_read.return_value = (audio_array, dummy_sampling_rate) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"" + actual = recognize( + MagicMock(spec=Recognizer), audio_data, show_dict=True + ) + + expected = { + "text": " 1, 2, 3", + "segments": [ + { + "id": 0, + "seek": 0, + "start": 0.0, + "end": 2.64, + "text": " 1, 2, 3", + "tokens": [50364, 502, 11, 568, 11, 805, 50496], + "temperature": 0.0, + "avg_logprob": -0.5379014015197754, + "compression_ratio": 0.4666666666666667, + "no_speech_prob": 0.17316073179244995, + } + ], + "language": "en", + } + + assert actual == expected + + def test_pass_parameters(self, load_model, is_available, sf_read): + whisper_model = load_model.return_value + audio_array = MagicMock(spec=np.ndarray) + dummy_sampling_rate = 99_999 + sf_read.return_value = (audio_array, dummy_sampling_rate) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"" + _ = recognize( + MagicMock(spec=Recognizer), + audio_data, + model="small", + language="english", + task="translate", + temperature=0, + ) + + load_model.assert_called_once_with("small") + whisper_model.transcribe.assert_called_once_with( + audio_array.astype.return_value, + fp16=is_available.return_value, + language="english", + task="translate", + temperature=0, + ) diff --git a/tests/test_audio.py b/tests/test_audio.py new file mode 100644 index 00000000..0e195157 --- /dev/null +++ b/tests/test_audio.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +import unittest +from os import path + +import speech_recognition as sr + + +class TestAudioFile(unittest.TestCase): + def assertSimilar(self, bytes_1, bytes_2): + for i, (byte_1, byte_2) in enumerate(zip(bytes_1, bytes_2)): + if abs(byte_1 - byte_2) > 2: + raise AssertionError("{} is really different from {} at index {}".format(bytes_1, bytes_2, i)) + + def test_get_segment(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-32-bit-44100Hz.wav")) as source: audio = r.record(source) + self.assertEqual(audio.get_raw_data(), audio.get_segment().get_raw_data()) + self.assertEqual(audio.get_raw_data()[8:], audio.get_segment(0.022675738 * 2).get_raw_data()) + self.assertEqual(audio.get_raw_data()[:16], audio.get_segment(None, 0.022675738 * 4).get_raw_data()) + self.assertEqual(audio.get_raw_data()[8:16], audio.get_segment(0.022675738 * 2, 0.022675738 * 4).get_raw_data()) + + def test_wav_mono_8_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-8-bit-44100Hz.wav")) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + self.assertEqual(audio.sample_width, 1) + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\xff\x00\xff\x00\xff\xff\x00\xff\x00\xff\x00\xff\x00\x00\xff\x00\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\xff") + + def test_wav_mono_16_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), 
"audio-mono-16-bit-44100Hz.wav")) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + self.assertEqual(audio.sample_width, 2) + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\xff\xff\x01\x00\xff\xff\x00\x00\x01\x00\xfe\xff\x01\x00\xfe\xff\x04\x00\xfc\xff\x04\x00\xfe\xff\xff\xff\x03\x00\xfe\xff") + + def test_wav_mono_24_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-24-bit-44100Hz.wav")) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + if audio.sample_width == 3: + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\xff\xff\x00\x01\x00\x00\xff\xff\x00\x00\x00\x00\x01\x00\x00\xfe\xff\x00\x01\x00\x00\xfe\xff\x00\x04\x00\x00\xfb") + else: + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x01\x00\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\xfe\xff\x00\x00\x01\x00") + + def test_wav_mono_32_bit(self): + r = sr.Recognizer() + audio_file_path = path.join(path.dirname(path.realpath(__file__)), "audio-mono-32-bit-44100Hz.wav") + with sr.AudioFile(audio_file_path) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + self.assertEqual(audio.sample_width, 4) + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x01\x00\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\xfe\xff\x00\x00\x01\x00") + + def test_wav_stereo_8_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-8-bit-44100Hz.wav")) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + self.assertEqual(audio.sample_width, 1) + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\xff\x00\xff\x00\x00\xff\x7f\x7f\x00\xff\x00\xff\x00\x00\xff\x00\x7f\x7f\x7f\x00\x00\xff\x00\xff\x00\xff\x00\x7f\x7f\x7f\x7f") + + def test_wav_stereo_16_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-16-bit-44100Hz.wav")) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + self.assertEqual(audio.sample_width, 2) + self.assertSimilar(audio.get_raw_data()[:32], b"\x02\x00\xfb\xff\x04\x00\xfe\xff\xfe\xff\x07\x00\xf6\xff\x07\x00\xf9\xff\t\x00\xf5\xff\x0c\x00\xf8\xff\x02\x00\x04\x00\xfa\xff") + + def test_wav_stereo_24_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-24-bit-44100Hz.wav")) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + if audio.sample_width == 3: + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\xfe\xff\x00\x02\x00\x00\xfe\xff\x00\x00\x00\x00\x02\x00\x00\xfc\xff\x00\x02\x00\x00\xfc\xff\x00\x08\x00\x00\xf6") + else: + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xfe\xff\x00\x00\x02\x00\x00\x00\xfe\xff\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\xfc\xff\x00\x00\x02\x00") + + def test_wav_stereo_32_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-32-bit-44100Hz.wav")) as source: audio = r.record(source) + 
self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + self.assertEqual(audio.sample_width, 4) + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xfe\xff\x00\x00\x02\x00\x00\x00\xfe\xff\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\xfc\xff\x00\x00\x02\x00") + + def test_aiff_mono_16_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-16-bit-44100Hz.aiff")) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + self.assertEqual(audio.sample_width, 2) + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\xff\xff\x01\x00\xff\xff\x01\x00\xfe\xff\x02\x00\xfd\xff\x04\x00\xfc\xff\x03\x00\x00\x00\xfe\xff\x03\x00\xfd\xff") + + def test_aiff_stereo_16_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-16-bit-44100Hz.aiff")) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + self.assertEqual(audio.sample_width, 2) + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\xfe\xff\x02\x00\xfe\xff\xff\xff\x04\x00\xfa\xff\x04\x00\xfa\xff\t\x00\xf6\xff\n\x00\xfa\xff\xff\xff\x08\x00\xf5\xff") + + def test_flac_mono_16_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-16-bit-44100Hz.flac")) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + self.assertEqual(audio.sample_width, 2) + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\xff\xff\x01\x00\xff\xff\x00\x00\x01\x00\xfe\xff\x02\x00\xfc\xff\x06\x00\xf9\xff\x06\x00\xfe\xff\xfe\xff\x05\x00\xfa\xff") + + def test_flac_mono_24_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-24-bit-44100Hz.flac")) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + if audio.sample_width == 3: + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\xff\xfe\xff\x02\x01\x00\xfd\xfe\xff\x04\x00\x00\xfc\x00\x00\x04\xfe\xff\xfb\x00\x00\x05\xfe\xff\xfc\x03\x00\x04\xfb") + else: + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\xff\xfe\xff\x00\x02\x01\x00\x00\xfd\xfe\xff\x00\x04\x00\x00\x00\xfc\x00\x00\x00\x04\xfe\xff\x00\xfb\x00\x00") + + def test_flac_stereo_16_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-16-bit-44100Hz.flac")) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + self.assertEqual(audio.sample_width, 2) + self.assertSimilar(audio.get_raw_data()[:32], b"\xff\xff\xff\xff\x02\x00\xfe\xff\x00\x00\x01\x00\xfd\xff\x01\x00\xff\xff\x04\x00\xfa\xff\x05\x00\xff\xff\xfd\xff\x08\x00\xf6\xff") + + def test_flac_stereo_24_bit(self): + r = sr.Recognizer() + with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-24-bit-44100Hz.flac")) as source: audio = r.record(source) + self.assertIsInstance(audio, sr.AudioData) + self.assertEqual(audio.sample_rate, 44100) + if audio.sample_width == 3: + self.assertSimilar(audio.get_raw_data()[:32], 
b"\x00\x00\x00\x00\xfe\xff\x00\x02\x00\x00\xfe\xff\x00\x00\x00\xff\x01\x00\x02\xfc\xff\xfe\x01\x00\x02\xfc\xff\xfe\x07\x00\x01\xf6") + else: + self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xfe\xff\x00\x00\x02\x00\x00\x00\xfe\xff\x00\x00\x00\x00\x00\xff\x01\x00\x00\x02\xfc\xff\x00\xfe\x01\x00") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_recognition.py b/tests/test_recognition.py new file mode 100644 index 00000000..90c17521 --- /dev/null +++ b/tests/test_recognition.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import sys +import unittest + +import speech_recognition as sr + + +class TestRecognition(unittest.TestCase): + def setUp(self): + self.AUDIO_FILE_EN = os.path.join(os.path.dirname(os.path.realpath(__file__)), "english.wav") + self.AUDIO_FILE_FR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "french.aiff") + self.AUDIO_FILE_ZH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "chinese.flac") + + def test_recognizer_attributes(self): + r = sr.Recognizer() + attributes = set(dir(r)) + + self.assertEqual(r.energy_threshold, 300) + self.assertTrue(r.dynamic_energy_threshold) + self.assertEqual(r.dynamic_energy_adjustment_damping, 0.15) + self.assertEqual(r.dynamic_energy_ratio, 1.5) + self.assertEqual(r.pause_threshold, 0.8) + self.assertIsNone(r.operation_timeout) + self.assertEqual(r.phrase_threshold, 0.3) + self.assertEqual(r.non_speaking_duration, 0.5) + # https://github.com/Uberi/speech_recognition/issues/743 + self.assertTrue("recognize_google" in attributes) + + @unittest.skipIf(sys.platform.startswith("win"), "skip on Windows") + def test_sphinx_english(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) + self.assertEqual(r.recognize_sphinx(audio), "one two three") + + @unittest.skipUnless("WIT_AI_KEY" in os.environ, "requires Wit.ai key to be specified in WIT_AI_KEY environment variable") + def test_wit_english(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) + self.assertEqual(r.recognize_wit(audio, key=os.environ["WIT_AI_KEY"]), "one two three") + + @unittest.skipUnless("BING_KEY" in os.environ, "requires Microsoft Bing Voice Recognition key to be specified in BING_KEY environment variable") + def test_bing_english(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) + self.assertEqual(r.recognize_bing(audio, key=os.environ["BING_KEY"]), "123.") + + @unittest.skipUnless("BING_KEY" in os.environ, "requires Microsoft Bing Voice Recognition key to be specified in BING_KEY environment variable") + def test_bing_french(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source) + self.assertEqual(r.recognize_bing(audio, key=os.environ["BING_KEY"], language="fr-FR"), u"Essaye la dictée numéro un.") + + @unittest.skipUnless("BING_KEY" in os.environ, "requires Microsoft Bing Voice Recognition key to be specified in BING_KEY environment variable") + def test_bing_chinese(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source) + self.assertEqual(r.recognize_bing(audio, key=os.environ["BING_KEY"], language="zh-CN"), u"砸自己的脚。") + + @unittest.skipUnless("HOUNDIFY_CLIENT_ID" in os.environ and "HOUNDIFY_CLIENT_KEY" in os.environ, "requires Houndify client ID and client key to be specified in HOUNDIFY_CLIENT_ID and 
HOUNDIFY_CLIENT_KEY environment variables") + def test_houndify_english(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) + self.assertEqual(r.recognize_houndify(audio, client_id=os.environ["HOUNDIFY_CLIENT_ID"], client_key=os.environ["HOUNDIFY_CLIENT_KEY"]), "one two three") + + @unittest.skipUnless("IBM_USERNAME" in os.environ and "IBM_PASSWORD" in os.environ, "requires IBM Speech to Text username and password to be specified in IBM_USERNAME and IBM_PASSWORD environment variables") + def test_ibm_english(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) + self.assertEqual(r.recognize_ibm(audio, username=os.environ["IBM_USERNAME"], password=os.environ["IBM_PASSWORD"]), "one two three ") + + @unittest.skipUnless("IBM_USERNAME" in os.environ and "IBM_PASSWORD" in os.environ, "requires IBM Speech to Text username and password to be specified in IBM_USERNAME and IBM_PASSWORD environment variables") + def test_ibm_french(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source) + self.assertEqual(r.recognize_ibm(audio, username=os.environ["IBM_USERNAME"], password=os.environ["IBM_PASSWORD"], language="fr-FR"), u"si la dictée numéro un ") + + @unittest.skipUnless("IBM_USERNAME" in os.environ and "IBM_PASSWORD" in os.environ, "requires IBM Speech to Text username and password to be specified in IBM_USERNAME and IBM_PASSWORD environment variables") + def test_ibm_chinese(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source) + self.assertEqual(r.recognize_ibm(audio, username=os.environ["IBM_USERNAME"], password=os.environ["IBM_PASSWORD"], language="zh-CN"), u"砸 自己 的 脚 ") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_special_features.py b/tests/test_special_features.py new file mode 100644 index 00000000..3038fe36 --- /dev/null +++ b/tests/test_special_features.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import sys +import unittest + +import speech_recognition as sr + + +class TestSpecialFeatures(unittest.TestCase): + def setUp(self): + self.AUDIO_FILE_EN = os.path.join(os.path.dirname(os.path.realpath(__file__)), "english.wav") + self.addTypeEqualityFunc(str, self.assertSameWords) + + @unittest.skipIf(sys.platform.startswith("win"), "skip on Windows") + def test_sphinx_keywords(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) + self.assertEqual(r.recognize_sphinx(audio, keyword_entries=[("one", 1.0), ("two", 1.0), ("three", 1.0)]), "three two one") + # pocketsphinx < 5 recognizes tree but pocketsphinx >= 5 ignores it (TODO need to research why) + self.assertEqual(r.recognize_sphinx(audio, keyword_entries=[("wan", 0.95), ("too", 1.0), ("tree", 1.0)]), "too wan") + # pocketsphinx < 5 recognizes tee but pocketsphinx >= 5 ignores it (TODO need to research why) + self.assertEqual(r.recognize_sphinx(audio, keyword_entries=[("un", 0.95), ("to", 1.0), ("tee", 1.0)]), "to un") + + def assertSameWords(self, tested, reference, msg=None): + set_tested = set(tested.split()) + set_reference = set(reference.split()) + if set_tested != set_reference: + raise self.failureException(msg if msg is not None else "%r doesn't consist of the same words as %r" % (tested, reference)) + + +if __name__ == "__main__": + unittest.main() diff --git a/third-party/Compiling Python extensions on 
Windows.pdf b/third-party/Compiling Python extensions on Windows.pdf deleted file mode 100644 index 2278e483..00000000 Binary files a/third-party/Compiling Python extensions on Windows.pdf and /dev/null differ diff --git a/third-party/LICENSE-PyAudio.txt b/third-party/LICENSE-PyAudio.txt deleted file mode 100644 index c1212270..00000000 --- a/third-party/LICENSE-PyAudio.txt +++ /dev/null @@ -1,7 +0,0 @@ -Copyright (c) 2006 Hubert Pham - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/third-party/LICENSE-Sphinx.txt b/third-party/LICENSE-Sphinx.txt deleted file mode 100644 index d5595525..00000000 --- a/third-party/LICENSE-Sphinx.txt +++ /dev/null @@ -1,31 +0,0 @@ -Copyright (c) 1999-2015 Carnegie Mellon University. All rights -reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - -This work was supported in part by funding from the Defense Advanced -Research Projects Agency and the National Science Foundation of the -United States of America, and the CMU Sphinx Speech Consortium. - -THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND -ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY -NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- \ No newline at end of file diff --git a/third-party/PyAudio-0.2.9-cp27-none-win_amd64.whl b/third-party/PyAudio-0.2.9-cp27-none-win_amd64.whl deleted file mode 100644 index f1dacc1e..00000000 Binary files a/third-party/PyAudio-0.2.9-cp27-none-win_amd64.whl and /dev/null differ diff --git a/third-party/PyAudio-0.2.9-cp34-none-win_amd64.whl b/third-party/PyAudio-0.2.9-cp34-none-win_amd64.whl deleted file mode 100644 index 568379df..00000000 Binary files a/third-party/PyAudio-0.2.9-cp34-none-win_amd64.whl and /dev/null differ diff --git a/third-party/PyAudio-0.2.9-cp35-none-win_amd64.whl b/third-party/PyAudio-0.2.9-cp35-none-win_amd64.whl deleted file mode 100644 index 9eb44480..00000000 Binary files a/third-party/PyAudio-0.2.9-cp35-none-win_amd64.whl and /dev/null differ diff --git a/third-party/PyAudio-0.2.9.tar.gz b/third-party/PyAudio-0.2.9.tar.gz deleted file mode 100644 index 933642d5..00000000 Binary files a/third-party/PyAudio-0.2.9.tar.gz and /dev/null differ diff --git a/third-party/flac-1.3.2.tar.xz b/third-party/flac-1.3.2.tar.xz new file mode 100644 index 00000000..5b9c69af Binary files /dev/null and b/third-party/flac-1.3.2.tar.xz differ diff --git a/third-party/irstlm-master.zip b/third-party/irstlm-master.zip index 5d54b3b0..c1f92640 100644 Binary files a/third-party/irstlm-master.zip and b/third-party/irstlm-master.zip differ diff --git a/third-party/pocketsphinx-0.0.9-cp27-none-win_amd64.whl b/third-party/pocketsphinx-0.0.9-cp27-none-win_amd64.whl deleted file mode 100644 index ea72c8fc..00000000 Binary files a/third-party/pocketsphinx-0.0.9-cp27-none-win_amd64.whl and /dev/null differ diff --git a/third-party/pocketsphinx-0.0.9-cp34-none-win_amd64.whl b/third-party/pocketsphinx-0.0.9-cp34-none-win_amd64.whl deleted file mode 100644 index 20517e8c..00000000 Binary files a/third-party/pocketsphinx-0.0.9-cp34-none-win_amd64.whl and /dev/null differ diff --git a/third-party/pocketsphinx-0.0.9-cp35-none-win_amd64.whl b/third-party/pocketsphinx-0.0.9-cp35-none-win_amd64.whl deleted file mode 100644 index 08889de9..00000000 Binary files a/third-party/pocketsphinx-0.0.9-cp35-none-win_amd64.whl and /dev/null differ diff --git a/third-party/pocketsphinx-0.0.9.zip b/third-party/pocketsphinx-0.0.9.zip deleted file mode 100644 index ce2cced4..00000000 Binary files a/third-party/pocketsphinx-0.0.9.zip and /dev/null differ