switch to faster-whisper and upgrade deltabot-cli

deltachat-bot · Mar 22, 2024 · 792f617
1 parent 6097b6c
commit 792f617
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 69 deletions.
diff --git a/README.md b/README.md
@@ -12,14 +12,8 @@ A voice-to-text converter bot for Delta Chat.
 pip install voice2text-deltabot
 ```
 
-The bot uses [Whisper](https://github.com/openai/whisper) to extract the text from voice messages,
-Whisper requires the command-line tool `ffmpeg` to be installed on your system, which is available
-from most package managers:
-
-```sh
-# on Ubuntu or Debian
-sudo apt update && sudo apt install ffmpeg
-```
+The bot uses [Faster Whisper](https://github.com/guillaumekln/faster-whisper/) to extract the text
+from voice messages.
 
 ## Usage
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,26 +1,25 @@
 [build-system]
-requires = ["setuptools"]
+requires = ["setuptools>=64", "setuptools_scm>=8"]
 build-backend = "setuptools.build_meta"
 
 [project]
-version = "0.1.3"
 name = "voice2text-deltabot"
 description = "Delta Chat bot to extract text from voice messages"
+dynamic = ["version"]
 readme = "README.md"
 requires-python = ">=3.8"
-license = {file = "LICENSE.txt"}
 keywords = ["deltachat", "bot"]
 authors = [
-  {name = "adbenitez", email = "[email protected]"},
+  {name = "adbenitez", email = "[email protected]"},
 ]
 classifiers = [
   "Development Status :: 4 - Beta",
-  "Programming Language :: Python"
+  "Programming Language :: Python",
+  "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
 ]
 dependencies = [
-    "deltabot-cli>=0.2.0",
-    "deltachat-rpc-server>=1.127.0",
-    "openai-whisper",
+    "deltabot-cli>=5.0.0,<6.0",
+    "faster-whisper",
 ]
 
 [project.optional-dependencies]
@@ -36,6 +35,9 @@ dev = [
 [project.scripts]
 voice2text-bot = "voice2text_deltabot:main"
 
+[tool.setuptools_scm]
+# can be empty if no extra settings are needed, presence enables setuptools_scm
+
 [tool.isort]
 profile = "black"
 

diff --git a/voice2text_deltabot/const.py b/voice2text_deltabot/const.py
diff --git a/voice2text_deltabot/hooks.py b/voice2text_deltabot/hooks.py
@@ -1,54 +1,79 @@
 """Event handlers and hooks."""
 
-import logging
 from argparse import Namespace
 
-import whisper
-from deltabot_cli import AttrDict, Bot, BotCli, EventType, const, events
-
-from .const import MODEL_CFG_KEY
-from .subcommands import add_subcommands
+from deltabot_cli import AttrDict, Bot, BotCli, ChatType, EventType, ViewType, events
+from faster_whisper import WhisperModel
 
 cli = BotCli("voice2text-bot")
-add_subcommands(cli)
-STATUS = "I am a Delta Chat bot, send me any voice message to convert it to text"
-MODEL: whisper.Whisper = None  # noqa
+cli.add_generic_option(
+    "--model",
+    help="set the whisper model to use, for example: small, medium, large. (default: %(default)s)",
+    default="large",
+)
+cli.add_generic_option(
+    "--device",
+    help="set the device type (default: %(default)s)",
+    choices=["cuda", "cpu"],
+    default="cpu",
+)
+cli.add_generic_option(
+    "--compute-type",
+    help="set the compute type (default: %(default)s)",
+    choices=["int8", "float16", "int8_float16"],
+    default="int8",
+)
+STATUS = "I'm a Delta Chat bot, send me any voice message to convert it to text"
+MODEL: WhisperModel = None  # noqa
 
 
 @cli.on_init
-def on_init(bot: Bot, _args: Namespace) -> None:
-    if not bot.account.get_config("displayname"):
-        bot.account.set_config("displayname", "Voice To Text")
-        bot.account.set_config("selfstatus", STATUS)
+def _on_init(bot: Bot, _args: Namespace) -> None:
+    for accid in bot.rpc.get_all_account_ids():
+        if not bot.rpc.get_config(accid, "displayname"):
+            bot.rpc.set_config(accid, "displayname", "Voice To Text")
+            bot.rpc.set_config(accid, "selfstatus", STATUS)
+            bot.rpc.set_config(accid, "delete_server_after", "1")
+            bot.rpc.set_config(accid, "delete_device_after", str(60 * 60 * 24))
 
 
 @cli.on_start
-def on_start(bot: Bot, _args: Namespace) -> None:
+def on_start(_bot: Bot, args: Namespace) -> None:
     global MODEL  # pylint: disable=W0603
-    model = bot.account.get_config(MODEL_CFG_KEY) or "medium"
-    MODEL = whisper.load_model(model)
+    MODEL = WhisperModel(args.model, device=args.device, compute_type=args.compute_type)
 
 
 @cli.on(events.RawEvent)
-def log_event(event: AttrDict) -> None:
+def _log_event(bot: Bot, accid: int, event: AttrDict) -> None:
     if event.kind == EventType.INFO:
-        logging.info(event.msg)
+        bot.logger.debug(event.msg)
     elif event.kind == EventType.WARNING:
-        logging.warning(event.msg)
+        bot.logger.warning(event.msg)
     elif event.kind == EventType.ERROR:
-        logging.error(event.msg)
+        bot.logger.error(event.msg)
+    elif event.kind == EventType.SECUREJOIN_INVITER_PROGRESS:
+        if event.progress == 1000:
+            bot.logger.debug("QR scanned by contact id=%s", event.contact_id)
+            chatid = bot.rpc.create_chat_by_contact_id(accid, event.contact_id)
+            bot.rpc.send_msg(accid, chatid, {"text": STATUS})
 
 
-@cli.on(events.NewMessage(is_info=False))
-def on_newmsg(event: AttrDict) -> None:
-    msg = event.message_snapshot
-    if msg.view_type in (const.ViewType.VOICE, const.ViewType.AUDIO):
-        result = MODEL.transcribe(msg.file)
-        msg.chat.send_message(text=result["text"], quoted_msg=msg.id)
-        return
+@cli.on(events.NewMessage(is_info=False, is_bot=None))
+def on_newmsg(bot: Bot, accid: int, event: AttrDict) -> None:
+    msg = event.msg
+    chat = bot.rpc.get_basic_chat_info(accid, msg.chat_id)
+    if chat.chat_type == ChatType.SINGLE:
+        bot.rpc.markseen_msgs(accid, [msg.id])
+        if msg.is_bot:
+            return
 
-    chat = event.message_snapshot.chat.get_basic_snapshot()
-    if chat.chat_type == const.ChatType.SINGLE:
-        event.message_snapshot.chat.send_message(
-            text=STATUS, quoted_msg=event.message_snapshot.id
+    if msg.view_type in (ViewType.VOICE, ViewType.AUDIO):
+        segments, info = MODEL.transcribe(msg.file)
+        bot.logger.info(
+            f"[chat={msg.chat_id}, msg={msg.id}] Detected language"
+            f" {info.language!r} with probability {info.language_probability}"
         )
+        text = " ".join(seg.text for seg in segments)
+        bot.rpc.send_msg(accid, msg.chat_id, {"text": text, "quotedMessageId": msg.id})
+    elif chat.chat_type == ChatType.SINGLE:
+        bot.rpc.send_msg(accid, msg.chat_id, {"text": STATUS})
diff --git a/voice2text_deltabot/subcommands.py b/voice2text_deltabot/subcommands.py