pipecat-ai · chadbailey59 · Apr 3, 2024 · Apr 3, 2024 · Apr 3, 2024 · Apr 4, 2024
diff --git a/examples/server/fly-server.py b/examples/server/fly-server.py
@@ -0,0 +1,201 @@
+import os
+import requests
+import urllib
+import subprocess
+import time
+
+from flask import Flask, jsonify, redirect, request
+from flask_cors import CORS
+
+from dotenv import load_dotenv
+load_dotenv(override=True)
+
+app = Flask(__name__)
+CORS(app)
+
+APPS = {  # allowlist for /start/<botname>: bot name -> script to run
+    "chatbot": "examples/starter-apps/chatbot.py",
+    "patient-intake": "examples/starter-apps/patient-intake.py",
+    "storybot": "examples/starter-apps/storybot.py",
+    "translator": "examples/starter-apps/translator.py"
+}
+
+daily_api_key = os.getenv("DAILY_API_KEY")  # Bearer credential for Daily REST calls
+api_path = os.getenv("DAILY_API_PATH") or "https://api.daily.co/v1"  # Daily REST base URL
+fly_api_key = os.getenv("FLY_API_KEY")  # Bearer credential for Fly Machines calls
+fly_app_name = os.getenv("FLY_APP_NAME")  # Fly app whose machines are listed/created
+fly_headers = {
+    'Authorization': f"Bearer {fly_api_key}",
+    'Content-Type': 'application/json'
+}
+fly_api_host = "https://api.machines.dev/v1"  # Fly Machines API base URL
+
+# Grab the first machine's image for launching bots; fail clearly if Fly lists none.
+res = requests.get(f"{fly_api_host}/apps/{fly_app_name}/machines", headers=fly_headers)
+if res.status_code != 200 or not res.json():
+    raise Exception(f"Unable to get machine info from Fly: {res.text}")
+image = res.json()[0]['config']['image']
+print(f"Image is: {image}")
+
+
+def get_room_name(room_url):  # e.g. "https://example.daily.co/hello/" -> "hello"
+    return urllib.parse.urlparse(room_url).path.strip("/")
+
+
+def create_room(room_properties, exp):
+    """Create a Daily room and return its (url, name) pair.
+
+    The defaults below (including "exp": exp) are overridden by any truthy
+    room_properties. Raises Exception on any non-200 response.
+    """
+    props = {
+        "exp": exp,
+        "enable_chat": True,
+        "enable_emoji_reactions": True,
+        "eject_at_room_exp": True,
+        "enable_prejoin_ui": False,
+        "enable_recording": "cloud"
+    }
+    if room_properties:
+        props.update(room_properties)
+    response = requests.post(
+        f"{api_path}/rooms",
+        headers={"Authorization": f"Bearer {daily_api_key}"},
+        json={"properties": props},
+    )
+    if response.status_code != 200:
+        raise Exception(f"Unable to create room: {response.text}")
+    room = response.json()
+    return (room["url"], room["name"])
+
+
+def create_token(room_name, token_properties, exp):
+    """Create a Daily meeting token scoped to room_name and return it.
+
+    Defaults to an owner token expiring at exp; truthy token_properties override
+    those defaults, but "room_name" is always forced to this room. Raises
+    Exception on any non-200 response from Daily.
+    """
+    token_props = {"exp": exp, "is_owner": True}
+    if token_properties:
+        token_props |= token_properties
+    token_props["room_name"] = room_name  # set last: never caller-overridable
+    res = requests.post(
+        f"{api_path}/meeting-tokens",
+        headers={"Authorization": f"Bearer {daily_api_key}"},
+        json={"properties": token_props})
+    if res.status_code != 200:
+        raise Exception(f"Unable to create meeting token: {res.text}")
+    return res.json()["token"]
+
+
+def start_bot(*, bot_path, room_url, token, bot_args, wait_for_bot):
+    """Launch a bot on a new Fly machine, optionally waiting for it to join.
+
+    The worker runs `python <bot_path> -u <room_url> -t <token> -k <daily key>`
+    followed by bot_args; the whole command is split on whitespace, and a
+    bot_args of None adds nothing. When wait_for_bot is truthy, the room's
+    get-session-data endpoint is polled up to 50 times, 0.1 s apart, and the
+    first 200 is taken to mean the bot has joined.
+
+    Returns True. Raises Exception if Fly rejects the machine request or the
+    polling budget runs out.
+    """
+    room_name = get_room_name(room_url)
+    # `bot_args or ''` keeps a missing bot_args from reaching the bot as the
+    # literal argument "None".
+    cmd = f"python {bot_path} -u {room_url} -t {token} -k {daily_api_key} {bot_args or ''}"
+    cmd = cmd.split()
+    # cmd carries the meeting token and the Daily API key, so log only the path.
+    print(f"!!! starting bot worker for: {bot_path}")
+    # NOTE(review): "restart" sits beside "config", not inside it; confirm
+    # against the Fly Machines API that the policy is honored there.
+    worker_props = {
+        "config": {
+            "image": image,
+            "auto_destroy": True,
+            "init": {"cmd": cmd},
+        },
+        "restart": {"policy": "no"},
+    }
+    res = requests.post(
+        fly_api_host + f"/apps/{fly_app_name}/machines",
+        headers=fly_headers,
+        json=worker_props
+    )
+    print("!!! Got past the request to start a bot")
+    if res.status_code != 200:
+        raise Exception(f"Problem starting a bot worker: {res.text}")
+    print(f"!!! worker creation response: {res.text}")
+    if not wait_for_bot:
+        return True
+    for attempt in range(1, 51):
+        time.sleep(0.1)
+        res = requests.get(
+            f"{api_path}/rooms/{room_name}/get-session-data",
+            headers={"Authorization": f"Bearer {daily_api_key}"},
+        )
+        if res.status_code == 200:
+            print(f"Took {attempt} attempts to join room {room_name}")
+            return True
+    raise Exception("The bot was unable to join the room. Please try again.")
+
+
+@app.route("/start/<string:botname>", methods=["GET", "POST"])
+def start(botname):
+    """Start an allowlisted bot in a Daily room and report where it is.
+
+    Options come from URL/form values, then a JSON object body (later wins):
+    room_url, room_properties, token_properties, bot_args, wait_for_bot,
+    duration (seconds), redirect. GET requests are redirected to the room unless
+    redirect is false; all others get JSON. Failures become a 500 response.
+    """
+    try:
+        if botname not in APPS:
+            raise Exception(f"Bot '{botname}' is not in the allowlist.")
+        props = {
+            "room_url": None,
+            "room_properties": None,
+            "token_properties": None,
+            "bot_args": None,
+            "wait_for_bot": True,
+            "duration": None,
+            "redirect": True
+        }
+        props |= request.values.to_dict()  # gets URL params as well as plaintext POST body
+        # silent=True yields None instead of raising on a missing/invalid body.
+        body = request.get_json(silent=True)
+        if isinstance(body, dict):
+            props |= body
+        for flag in ("redirect", "wait_for_bot"):
+            if props[flag] == "false":
+                props[flag] = False
+
+        duration = int(os.getenv("DAILY_BOT_DURATION") or 7200)
+        if props['duration']:
+            duration = float(props['duration'])  # URL/form values arrive as strings
+        exp = time.time() + duration
+        if props['room_url']:
+            room_url = props['room_url']
+            try:
+                room_name = get_room_name(room_url)
+            except ValueError:
+                raise Exception(
+                    "There was a problem detecting the room name. Please double-check the value of room_url.")
+        else:
+            room_url, room_name = create_room(props['room_properties'], exp)
+        token = create_token(room_name, props['token_properties'], exp)
+        bot = start_bot(
+            room_url=room_url, bot_path=APPS[botname], token=token,
+            bot_args=props['bot_args'], wait_for_bot=props['wait_for_bot'])
+        print(f"!!! Bot is: {bot}")
+        if props['redirect'] and request.method == "GET":
+            return redirect(room_url, 302)
+        return jsonify({"room_url": room_url, "token": token})
+    except Exception as e:
+        return f"There was a problem starting the bot: {e}", 500
+
+
+@app.route("/healthz")
+def health_check():  # health endpoint: constant "ok" body with HTTP 200
+    return "ok", 200
diff --git a/src/dailyai/transports/daily_transport.py b/src/dailyai/transports/daily_transport.py
@@ -5,6 +5,7 @@
 import threading
 import types
 
+from enum import Enum
 from functools import partial
 from typing import Any
 
@@ -26,6 +27,11 @@
 
 from dailyai.transports.threaded_transport import ThreadedTransport
 
+NUM_CHANNELS = 1  # channel count passed to Daily.create_native_vad
+
+SPEECH_THRESHOLD = 0.90  # VAD confidence must exceed this to count as speech
+VAD_RESET_PERIOD_MS = 2000  # reset_period_ms passed to Daily.create_native_vad
+
 
 class DailyTransport(ThreadedTransport, EventHandler):
     _daily_initialized = False
@@ -48,6 +54,7 @@ def __init__(
         start_transcription: bool = False,
         **kwargs,
     ):
+        kwargs['has_webrtc_vad'] = True
         # This will call ThreadedTransport.__init__ method, not EventHandler
         super().__init__(**kwargs)
 
@@ -79,6 +86,12 @@ def __init__(
 
         self._event_handlers = {}
 
+        self.webrtc_vad = Daily.create_native_vad(
+            reset_period_ms=VAD_RESET_PERIOD_MS,
+            sample_rate=self._speaker_sample_rate,
+            channels=NUM_CHANNELS
+        )
+
     def _patch_method(self, event_name, *args, **kwargs):
         try:
             for handler in self._event_handlers[event_name]:
@@ -99,6 +112,18 @@ def _patch_method(self, event_name, *args, **kwargs):
             self._logger.error(f"Exception in event handler {event_name}: {e}")
             raise e
 
+    def _webrtc_vad_analyze(self):
+        """Return True when the native VAD hears speech in the next audio chunk.
+
+        Reads self._vad_samples frames and compares the analyzer's confidence
+        with SPEECH_THRESHOLD. An empty read counts as silence and returns
+        False, so callers always get a bool.
+        """
+        buffer = self.read_audio_frames(int(self._vad_samples))
+        if len(buffer) == 0:
+            return False
+        return self.webrtc_vad.analyze_frames(buffer) > SPEECH_THRESHOLD
+
     def add_event_handler(self, event_name: str, handler):
         if not event_name.startswith("on_"):
             raise Exception(

diff --git a/src/dailyai/transports/threaded_transport.py b/src/dailyai/transports/threaded_transport.py
@@ -41,9 +41,6 @@ def int2float(sound):
     return sound
 
 
-SAMPLE_RATE = 16000
-
-
 class VADState(Enum):
     QUIET = 1
     STARTING = 2
@@ -62,11 +59,12 @@ def __init__(
         self._vad_stop_s = kwargs.get("vad_stop_s") or 0.8
         self._context = kwargs.get("context") or []
         self._vad_enabled = kwargs.get("vad_enabled") or False
-
+        self._has_webrtc_vad = kwargs.get("has_webrtc_vad") or False
         if self._vad_enabled and self._speaker_enabled:
             raise Exception(
                 "Sorry, you can't use speaker_enabled and vad_enabled at the same time. Please set one to False."
             )
+        self._vad_samples = 1536
 
         if self._vad_enabled:
             try:
@@ -80,14 +78,19 @@ def __init__(
                 (self.model, self.utils) = torch.hub.load(
                     repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False
                 )
+                self._logger.debug("Loaded Silero VAD")
 
             except ModuleNotFoundError as e:
-                print(f"Exception: {e}")
-                print("In order to use VAD, you'll need to install the `torch` and `torchaudio` modules.")
-                raise Exception(f"Missing module(s): {e}")
+                if self._has_webrtc_vad:
+                    self._logger.debug(f"Couldn't load torch; using webrtc VAD")
+                    self._vad_samples = int(self._speaker_sample_rate / 100.0)
+                else:
+                    self._logger.error(f"Exception: {e}")
+                    self._logger.error(
+                        "In order to use VAD, you'll need to install the `torch` and `torchaudio` modules.")
+                    raise Exception(f"Missing module(s): {e}")
 
-        self._vad_samples = 1536
-        vad_frame_s = self._vad_samples / SAMPLE_RATE
+        vad_frame_s = self._vad_samples / self._speaker_sample_rate
         self._vad_start_frames = round(self._vad_start_s / vad_frame_s)
         self._vad_stop_frames = round(self._vad_stop_s / vad_frame_s)
         self._vad_starting_count = 0
@@ -263,19 +266,28 @@ def read_audio_frames(self, desired_frame_count):
     def _prerun(self):
         pass
 
+    def _silero_vad_analyze(self):
+        """Return True when Silero scores the next audio chunk above 0.5.
+
+        The chunk is read as int16 samples, converted with int2float and passed
+        to self.model together with a fixed 16000 sample rate.
+        NOTE(review): 16000 is hard-coded while frame timing elsewhere uses
+        self._speaker_sample_rate; confirm the two always agree.
+        """
+        raw = self.read_audio_frames(self._vad_samples)
+        samples = int2float(np.frombuffer(raw, np.int16))
+        confidence = self.model(torch.from_numpy(samples), 16000).item()
+        return confidence > 0.5
+
     def _vad(self):
-        # CB: Starting silero VAD stuff
-        # TODO-CB: Probably need to force virtual speaker creation if we're
-        # going to build this in?
-        # TODO-CB: pyaudio installation
-        while not self._stop_threads.is_set():
-            audio_chunk = self.read_audio_frames(self._vad_samples)
-            audio_int16 = np.frombuffer(audio_chunk, np.int16)
-            audio_float32 = int2float(audio_int16)
-            new_confidence = self.model(
-                torch.from_numpy(audio_float32), 16000).item()
-            speaking = new_confidence > 0.5
 
+        while not self._stop_threads.is_set():
+            if hasattr(self, 'model'):  # we can use Silero
+                speaking = self._silero_vad_analyze()
+            elif self._has_webrtc_vad:
+                speaking = self._webrtc_vad_analyze()
+            else:
+                raise Exception("VAD is running with no VAD service available")
             if speaking:
                 match self._vad_state:
                     case VADState.QUIET:
@@ -312,6 +324,7 @@ def _vad(self):
                 self._vad_state == VADState.STOPPING
                 and self._vad_stopping_count >= self._vad_stop_frames
             ):
+
                 if self._loop:
                     asyncio.run_coroutine_threadsafe(
                         self.receive_queue.put(