diff --git a/CHANGELOG.md b/CHANGELOG.md index 3545b0744..932ccd8b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added a new foundational example `07e-interruptible-playht-http.py` for easy + testing of `PlayHTHttpTTSService`. + - Added support for Google TTS Journey voices in `GoogleTTSService`. - Added `29-livekit-audio-chat.py`, as a new foundational examples for @@ -27,12 +30,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- Changed the default model for `PlayHTHttpTTSService` to `Play3.0-mini-http`. + - api_key, aws_access_key_id and region are no longer required parameters for the PollyTTSService (AWSTTSService) + - Added `session_timeout` example in `examples/websocket-server/bot.py` to handle session timeout event. + - Changed `InputParams` in `src/pipecat/services/gemini_multimodal_live/gemini.py` to support different modalities. ### Fixed +- Fixed an import issue for `PlayHTHttpTTSService`. + +- Fixed an issue where languages couldn't be used with the `PlayHTHttpTTSService`. + - Fixed an issue where `OpenAIRealtimeBetaLLMService` audio chunks were hitting an error when truncating audio content. 
diff --git a/examples/foundational/07e-interruptible-playht-http.py b/examples/foundational/07e-interruptible-playht-http.py
new file mode 100644
index 000000000..af2844ff5
--- /dev/null
+++ b/examples/foundational/07e-interruptible-playht-http.py
@@ -0,0 +1,107 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import asyncio
+import os
+import sys
+
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+from runner import configure
+
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.frames.frames import LLMMessagesFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.services.openai import OpenAILLMService
+from pipecat.services.playht import PlayHTHttpTTSService
+from pipecat.transcriptions.language import Language
+from pipecat.transports.services.daily import DailyParams, DailyTransport
+
+load_dotenv(override=True)
+
+logger.remove(0)
+logger.add(sys.stderr, level="DEBUG")
+
+
+async def main():
+    """Run a voice bot in a Daily room that speaks through `PlayHTHttpTTSService`.
+
+    The pipeline sends the user's transcribed speech to the OpenAI LLM, turns the
+    reply into audio with PlayHT and plays it back; interruptions are allowed.
+    """
+    async with aiohttp.ClientSession() as session:
+        (room_url, token) = await configure(session)
+
+        transport = DailyTransport(
+            room_url,
+            token,
+            "Respond bot",
+            DailyParams(
+                audio_out_enabled=True,
+                transcription_enabled=True,
+                vad_enabled=True,
+                vad_analyzer=SileroVADAnalyzer(),
+            ),
+        )
+
+        tts = PlayHTHttpTTSService(
+            user_id=os.getenv("PLAYHT_USER_ID"),
+            api_key=os.getenv("PLAYHT_API_KEY"),
+            voice_url="s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
+        )
+
+        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
+
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+            },
+        ]
+
+        context = OpenAILLMContext(messages)
+        context_aggregator = llm.create_context_aggregator(context)
+
+        pipeline = Pipeline(
+            [
+                transport.input(),  # Transport user input
+                context_aggregator.user(),  # User responses
+                llm,  # LLM
+                tts,  # TTS
+                transport.output(),  # Transport bot output
+                context_aggregator.assistant(),  # Assistant spoken responses
+            ]
+        )
+
+        task = PipelineTask(
+            pipeline,
+            PipelineParams(
+                allow_interruptions=True,
+                enable_metrics=True,
+                enable_usage_metrics=True,
+                report_only_initial_ttfb=True,
+            ),
+        )
+
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            """Capture the first participant's transcription and start the chat."""
+            await transport.capture_participant_transcription(participant["id"])
+            # Kick off the conversation.
+            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
+            await task.queue_frames([LLMMessagesFrame(messages)])
+
+        runner = PipelineRunner()
+
+        await runner.run(task)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/pipecat/services/playht.py b/src/pipecat/services/playht.py
index ddbcd76b1..af490e5f1 100644
--- a/src/pipecat/services/playht.py
+++ b/src/pipecat/services/playht.py
@@ -37,8 +37,8 @@
 
 try:
     from pyht.async_client import AsyncClient
-    from pyht.client import TTSOptions
-    from pyht.protos.api_pb2 import Format
+    from pyht.client import Format, TTSOptions
+    from pyht.client import Language as PlayHTLanguage
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error(
@@ -363,7 +363,7 @@ def __init__(
         api_key: str,
         user_id: str,
         voice_url: str,
-        voice_engine: str = "Play3.0-mini",
+        voice_engine: str = "Play3.0-mini-http",  # Options: Play3.0-mini-http, Play3.0-mini-ws
         sample_rate: int = 24000,
         params: InputParams = InputParams(),
         **kwargs,
@@ -389,9 +389,22 @@ def __init__(
         }
         self.set_model_name(voice_engine)
         self.set_voice(voice_url)
+
+        # Convert the configured language string to the PlayHT Language enum.
+        # Enum lookup by value replaces the manual scan over all members.
+        language_str = self._settings["language"]
+        playht_language = None
+        if language_str:
+            try:
+                playht_language = PlayHTLanguage(language_str)
+            except ValueError:
+                # No enum member has this value: fall back to no explicit
+                # language, but say so instead of dropping it silently.
+                logger.warning(f"Unsupported PlayHT language: {language_str}")
+
         self._options = TTSOptions(
             voice=self._voice_id,
-            language=self._settings["language"],
+            language=playht_language,
             sample_rate=self._settings["sample_rate"],
             format=self._settings["format"],
             speed=self._settings["speed"],