Merge pull request #258 from pipecat-ai/aleix/upgrade-cartesia-1.0.0

services(cartesia): upgrade to new cartesia 1.0.0
pipecat-ai · Jun 25, 2024 · 253530a
2 parents 84074e9 + 4f38d98
commit 253530a
Show file tree

Hide file tree

Showing 8 changed files with 31 additions and 29 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Upgraded to Cartesia's new Python library 1.0.0. `CartesiaTTSService` now
+  expects a voice ID instead of a voice name (you can get the voice ID from
+  Cartesia's playground). You can also specify the audio `sample_rate` and
+  `encoding` instead of the previous `output_format`.
+
 ### Fixed
 
 - Fixed an issue with asynchronous STT services (Deepgram and Azure) that could

diff --git a/examples/foundational/07d-interruptible-cartesia.py b/examples/foundational/07d-interruptible-cartesia.py
@@ -38,7 +38,6 @@ async def main(room_url: str, token):
         "Respond bot",
         DailyParams(
             audio_out_enabled=True,
-            audio_out_sample_rate=44100,
             transcription_enabled=True,
             vad_enabled=True,
             vad_analyzer=SileroVADAnalyzer()
@@ -47,8 +46,7 @@ async def main(room_url: str, token):
 
     tts = CartesiaTTSService(
         api_key=os.getenv("CARTESIA_API_KEY"),
-        voice_name="British Lady",
-        output_format="pcm_44100"
+        voice_id="a0e99841-438c-4a64-b679-ae501e7d6091",  # Barbershop Man
     )
 
     llm = OpenAILLMService(

diff --git a/examples/foundational/15-switch-voices.py b/examples/foundational/15-switch-voices.py
@@ -66,7 +66,6 @@ async def main(room_url: str, token):
             "Pipecat",
             DailyParams(
                 audio_out_enabled=True,
-                audio_out_sample_rate=44100,
                 transcription_enabled=True,
                 vad_enabled=True,
                 vad_analyzer=SileroVADAnalyzer()
@@ -75,20 +74,17 @@ async def main(room_url: str, token):
 
         news_lady = CartesiaTTSService(
             api_key=os.getenv("CARTESIA_API_KEY"),
-            voice_name="Newslady",
-            output_format="pcm_44100"
+            voice_id="bf991597-6c13-47e4-8411-91ec2de5c466",  # Newslady
         )
 
         british_lady = CartesiaTTSService(
             api_key=os.getenv("CARTESIA_API_KEY"),
-            voice_name="British Lady",
-            output_format="pcm_44100"
+            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
         )
 
         barbershop_man = CartesiaTTSService(
             api_key=os.getenv("CARTESIA_API_KEY"),
-            voice_name="Barbershop Man",
-            output_format="pcm_44100"
+            voice_id="a0e99841-438c-4a64-b679-ae501e7d6091",  # Barbershop Man
         )
 
         llm = OpenAILLMService(

diff --git a/linux-py3.10-requirements.txt b/linux-py3.10-requirements.txt
@@ -44,7 +44,7 @@ blinker==1.8.2
     # via flask
 cachetools==5.3.3
     # via google-auth
-cartesia==0.1.1
+cartesia==1.0.0
     # via pipecat-ai (pyproject.toml)
 certifi==2024.6.2
     # via

diff --git a/macos-py3.10-requirements.txt b/macos-py3.10-requirements.txt
@@ -44,7 +44,7 @@ blinker==1.8.2
     # via flask
 cachetools==5.3.3
     # via google-auth
-cartesia==0.1.1
+cartesia==1.0.0
     # via pipecat-ai (pyproject.toml)
 certifi==2024.6.2
     # via
@@ -210,7 +210,7 @@ langchain-core==0.2.9
     #   langchain-community
     #   langchain-openai
     #   langchain-text-splitters
-langchain-openai==0.1.9
+langchain-openai==0.1.10
     # via pipecat-ai (pyproject.toml)
 langchain-text-splitters==0.2.1
     # via langchain

diff --git a/pyproject.toml b/pyproject.toml
@@ -36,7 +36,7 @@ Website = "https://pipecat.ai"
 [project.optional-dependencies]
 anthropic = [ "anthropic~=0.25.7" ]
 azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
-cartesia = [ "cartesia~=0.1.1" ]
+cartesia = [ "cartesia~=1.0.0" ]
 daily = [ "daily-python~=0.10.1" ]
 deepgram = [ "deepgram-sdk~=3.2.7" ]
 examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]

diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py
@@ -16,14 +16,13 @@
     EndFrame,
     ErrorFrame,
     Frame,
-    LLMFullResponseStartFrame,
+    LLMFullResponseEndFrame,
     StartFrame,
     StartInterruptionFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
     TextFrame,
     VisionImageRawFrame,
-    LLMFullResponseEndFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.utils.audio import calculate_audio_volume

diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-from cartesia.tts import AsyncCartesiaTTS
+from cartesia import AsyncCartesia
 
 from typing import AsyncGenerator
 
@@ -20,22 +20,24 @@ def __init__(
             self,
             *,
             api_key: str,
-            voice_name: str,
-            model_id: str = "upbeat-moon",
-            output_format: str = "pcm_16000",
+            voice_id: str,
+            model_id: str = "sonic-english",
+            encoding: str = "pcm_s16le",
+            sample_rate: int = 16000,
             **kwargs):
         super().__init__(**kwargs)
 
         self._api_key = api_key
-        self._voice_name = voice_name
         self._model_id = model_id
-        self._output_format = output_format
+        self._output_format = {
+            "container": "raw",
+            "encoding": encoding,
+            "sample_rate": sample_rate,
+        }
 
         try:
-            self._client = AsyncCartesiaTTS(api_key=self._api_key)
-            voices = self._client.get_voices()
-            voice_id = voices[self._voice_name]["id"]
-            self._voice = self._client.get_voice_embedding(voice_id=voice_id)
+            self._client = AsyncCartesia(api_key=self._api_key)
+            self._voice = self._client.voices.get(id=voice_id)
         except Exception as e:
             logger.error(f"{self} initialization error: {e}")
 
@@ -48,16 +50,16 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         try:
             await self.start_ttfb_metrics()
 
-            chunk_generator = await self._client.generate(
+            chunk_generator = await self._client.tts.sse(
                 stream=True,
                 transcript=text,
-                voice=self._voice,
+                voice_embedding=self._voice["embedding"],
                 model_id=self._model_id,
                 output_format=self._output_format,
             )
 
             async for chunk in chunk_generator:
                 await self.stop_ttfb_metrics()
-                yield AudioRawFrame(chunk["audio"], chunk["sampling_rate"], 1)
+                yield AudioRawFrame(chunk["audio"], self._output_format["sample_rate"], 1)
         except Exception as e:
             logger.error(f"{self} exception: {e}")