From f406d93b0fdcf8349ded9b502ca0caeaf135f0c9 Mon Sep 17 00:00:00 2001
From: Mark Backman
Date: Sun, 12 Jan 2025 11:35:07 -0500
Subject: [PATCH] Align 26d example with foundation norms

---
 .../26d-gemini-multimodal-live-text.py | 49 +++++++++++++++++--
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/examples/foundational/26d-gemini-multimodal-live-text.py b/examples/foundational/26d-gemini-multimodal-live-text.py
index 493c2983a..815f32165 100644
--- a/examples/foundational/26d-gemini-multimodal-live-text.py
+++ b/examples/foundational/26d-gemini-multimodal-live-text.py
@@ -15,13 +15,16 @@
 
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
+from pipecat.frames.frames import EndFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.services.cartesia import CartesiaTTSService
 from pipecat.services.gemini_multimodal_live.gemini import (
     GeminiMultimodalLiveLLMService,
     GeminiMultimodalModalities,
+    InputParams,
 )
 from pipecat.transports.services.daily import DailyParams, DailyTransport
 
@@ -30,6 +33,16 @@
 logger.remove(0)
 logger.add(sys.stderr, level="DEBUG")
 
+SYSTEM_INSTRUCTION = f"""
+You are Gemini Chatbot, a friendly, helpful robot.
+
+Your goal is to demonstrate your capabilities in a succinct way.
+
+Your output will be converted to audio so don't include special characters in your answers.
+
+Respond to what the user said in a creative and helpful way. Keep your responses brief. One or two sentences at most.
+"""
+
 
 async def main():
     async with aiohttp.ClientSession() as session:
@@ -55,24 +68,42 @@ async def main():
 
         llm = GeminiMultimodalLiveLLMService(
             api_key=os.getenv("GOOGLE_API_KEY"),
-            # system_instruction="Talk like a pirate."
             transcribe_user_audio=True,
             transcribe_model_audio=True,
+            system_instruction=SYSTEM_INSTRUCTION,
+            tools=[{"google_search": {}}, {"code_execution": {}}],
+            params=InputParams(modalities=GeminiMultimodalModalities.TEXT),
         )
-        llm.set_model_modalities(
-            GeminiMultimodalModalities.TEXT
-        )  # This forces model to produce text only responses
+
+        # Optionally, you can set the response modalities via a function
+        # llm.set_model_modalities(
+        #     GeminiMultimodalModalities.TEXT
+        # )
 
         tts = CartesiaTTSService(
             api_key=os.getenv("CARTESIA_API_KEY"), voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22"
         )
 
+        messages = [
+            {
+                "role": "user",
+                "content": 'Start by saying "Hello, I\'m Gemini".',
+            },
+        ]
+
+        # Set up conversation context and management
+        # The context_aggregator will automatically collect conversation context
+        context = OpenAILLMContext(messages)
+        context_aggregator = llm.create_context_aggregator(context)
+
         pipeline = Pipeline(
             [
                 transport.input(),
+                context_aggregator.user(),
                 llm,
                 tts,
                 transport.output(),
+                context_aggregator.assistant(),
             ]
         )
 
@@ -85,6 +116,16 @@ async def main():
             ),
         )
 
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            await transport.capture_participant_transcription(participant["id"])
+            await task.queue_frames([context_aggregator.user().get_context_frame()])
+
+        @transport.event_handler("on_participant_left")
+        async def on_participant_left(transport, participant, reason):
+            print(f"Participant left: {participant}")
+            await task.queue_frame(EndFrame())
+
         runner = PipelineRunner()
 
         await runner.run(task)
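
Reviewer note: below is a condensed sketch of the service and context wiring the example ends up with after this patch. It is illustrative only: the build_pipeline helper, its transport and system_instruction arguments are assumptions made for the sketch, not code from the example; the service calls, parameters, and aggregator placement are taken directly from the diff above, and GOOGLE_API_KEY / CARTESIA_API_KEY are expected in the environment as in the example.

import os

from pipecat.pipeline.pipeline import Pipeline
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.gemini_multimodal_live.gemini import (
    GeminiMultimodalLiveLLMService,
    GeminiMultimodalModalities,
    InputParams,
)


def build_pipeline(transport, system_instruction):
    """Hypothetical helper: Gemini Live emits text only; Cartesia TTS turns it into speech."""
    llm = GeminiMultimodalLiveLLMService(
        api_key=os.getenv("GOOGLE_API_KEY"),
        system_instruction=system_instruction,
        tools=[{"google_search": {}}, {"code_execution": {}}],
        params=InputParams(modalities=GeminiMultimodalModalities.TEXT),
    )

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",
    )

    # The aggregator pair wraps the LLM so user and assistant turns are both recorded.
    context = OpenAILLMContext(
        [{"role": "user", "content": 'Start by saying "Hello, I\'m Gemini".'}]
    )
    context_aggregator = llm.create_context_aggregator(context)

    return Pipeline(
        [
            transport.input(),  # transport audio/transcription in
            context_aggregator.user(),  # record user turns
            llm,  # text-only Gemini responses
            tts,  # Cartesia converts the text to audio
            transport.output(),  # audio out
            context_aggregator.assistant(),  # record assistant turns
        ]
    )

As in the patch, the conversation is kicked off by queueing context_aggregator.user().get_context_frame() when the first participant joins.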