Skip to content

Commit

Permalink
Merge pull request #258 from pipecat-ai/aleix/upgrade-cartesia-1.0.0
Browse files Browse the repository at this point in the history
services(cartesia): upgrade to new cartesia 1.0.0
  • Loading branch information
aconchillo authored Jun 25, 2024
2 parents 84074e9 + 4f38d98 commit 253530a
Show file tree
Hide file tree
Showing 8 changed files with 31 additions and 29 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- Upgraded to Cartesia's new Python library 1.0.0. `CartesiaTTSService` now
  expects a voice ID (`voice_id`) instead of a voice name (`voice_name`); you
  can get the voice ID from Cartesia's playground. The audio format is now
  specified with separate `sample_rate` and `encoding` arguments instead of the
  previous `output_format` string.

### Fixed

- Fixed an issue with asynchronous STT services (Deepgram and Azure) that could
Expand Down
4 changes: 1 addition & 3 deletions examples/foundational/07d-interruptible-cartesia.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ async def main(room_url: str, token):
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=44100,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
Expand All @@ -47,8 +46,7 @@ async def main(room_url: str, token):

tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_name="British Lady",
output_format="pcm_44100"
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
)

llm = OpenAILLMService(
Expand Down
10 changes: 3 additions & 7 deletions examples/foundational/15-switch-voices.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ async def main(room_url: str, token):
"Pipecat",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=44100,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
Expand All @@ -75,20 +74,17 @@ async def main(room_url: str, token):

news_lady = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_name="Newslady",
output_format="pcm_44100"
voice_id="bf991597-6c13-47e4-8411-91ec2de5c466", # Newslady
)

british_lady = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_name="British Lady",
output_format="pcm_44100"
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)

barbershop_man = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_name="Barbershop Man",
output_format="pcm_44100"
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
)

llm = OpenAILLMService(
Expand Down
2 changes: 1 addition & 1 deletion linux-py3.10-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ blinker==1.8.2
# via flask
cachetools==5.3.3
# via google-auth
cartesia==0.1.1
cartesia==1.0.0
# via pipecat-ai (pyproject.toml)
certifi==2024.6.2
# via
Expand Down
4 changes: 2 additions & 2 deletions macos-py3.10-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ blinker==1.8.2
# via flask
cachetools==5.3.3
# via google-auth
cartesia==0.1.1
cartesia==1.0.0
# via pipecat-ai (pyproject.toml)
certifi==2024.6.2
# via
Expand Down Expand Up @@ -210,7 +210,7 @@ langchain-core==0.2.9
# langchain-community
# langchain-openai
# langchain-text-splitters
langchain-openai==0.1.9
langchain-openai==0.1.10
# via pipecat-ai (pyproject.toml)
langchain-text-splitters==0.2.1
# via langchain
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Website = "https://pipecat.ai"
[project.optional-dependencies]
anthropic = [ "anthropic~=0.25.7" ]
azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
cartesia = [ "cartesia~=0.1.1" ]
cartesia = [ "cartesia~=1.0.0" ]
daily = [ "daily-python~=0.10.1" ]
deepgram = [ "deepgram-sdk~=3.2.7" ]
examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]
Expand Down
3 changes: 1 addition & 2 deletions src/pipecat/services/ai_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,13 @@
EndFrame,
ErrorFrame,
Frame,
LLMFullResponseStartFrame,
LLMFullResponseEndFrame,
StartFrame,
StartInterruptionFrame,
TTSStartedFrame,
TTSStoppedFrame,
TextFrame,
VisionImageRawFrame,
LLMFullResponseEndFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.audio import calculate_audio_volume
Expand Down
28 changes: 15 additions & 13 deletions src/pipecat/services/cartesia.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# SPDX-License-Identifier: BSD 2-Clause License
#

from cartesia.tts import AsyncCartesiaTTS
from cartesia import AsyncCartesia

from typing import AsyncGenerator

Expand All @@ -20,22 +20,24 @@ def __init__(
self,
*,
api_key: str,
voice_name: str,
model_id: str = "upbeat-moon",
output_format: str = "pcm_16000",
voice_id: str,
model_id: str = "sonic-english",
encoding: str = "pcm_s16le",
sample_rate: int = 16000,
**kwargs):
super().__init__(**kwargs)

self._api_key = api_key
self._voice_name = voice_name
self._model_id = model_id
self._output_format = output_format
self._output_format = {
"container": "raw",
"encoding": encoding,
"sample_rate": sample_rate,
}

try:
self._client = AsyncCartesiaTTS(api_key=self._api_key)
voices = self._client.get_voices()
voice_id = voices[self._voice_name]["id"]
self._voice = self._client.get_voice_embedding(voice_id=voice_id)
self._client = AsyncCartesia(api_key=self._api_key)
self._voice = self._client.voices.get(id=voice_id)
except Exception as e:
logger.error(f"{self} initialization error: {e}")

Expand All @@ -48,16 +50,16 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
try:
await self.start_ttfb_metrics()

chunk_generator = await self._client.generate(
chunk_generator = await self._client.tts.sse(
stream=True,
transcript=text,
voice=self._voice,
voice_embedding=self._voice["embedding"],
model_id=self._model_id,
output_format=self._output_format,
)

async for chunk in chunk_generator:
await self.stop_ttfb_metrics()
yield AudioRawFrame(chunk["audio"], chunk["sampling_rate"], 1)
yield AudioRawFrame(chunk["audio"], self._output_format["sample_rate"], 1)
except Exception as e:
logger.error(f"{self} exception: {e}")

0 comments on commit 253530a

Please sign in to comment.