Merge pull request #974 from pipecat-ai/mb/26d-example

Align 26d example with foundation norms
pipecat-ai · Jan 13, 2025 · da18785 · da18785
2 parents 3cd2b90 + f406d93
commit da18785
Showing 1 changed file with 45 additions and 4 deletions.
diff --git a/examples/foundational/26d-gemini-multimodal-live-text.py b/examples/foundational/26d-gemini-multimodal-live-text.py
@@ -15,13 +15,16 @@
 
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
+from pipecat.frames.frames import EndFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.services.cartesia import CartesiaTTSService
 from pipecat.services.gemini_multimodal_live.gemini import (
     GeminiMultimodalLiveLLMService,
     GeminiMultimodalModalities,
+    InputParams,
 )
 from pipecat.transports.services.daily import DailyParams, DailyTransport
 
@@ -30,6 +33,16 @@
 logger.remove(0)
 logger.add(sys.stderr, level="DEBUG")
 
+SYSTEM_INSTRUCTION = f"""
+You are Gemini Chatbot, a friendly, helpful robot.
+
+Your goal is to demonstrate your capabilities in a succinct way.
+
+Your output will be converted to audio so don't include special characters in your answers.
+
+Respond to what the user said in a creative and helpful way. Keep your responses brief. One or two sentences at most.
+"""
+
 
 async def main():
     async with aiohttp.ClientSession() as session:
@@ -55,24 +68,42 @@ async def main():
 
         llm = GeminiMultimodalLiveLLMService(
             api_key=os.getenv("GOOGLE_API_KEY"),
-            # system_instruction="Talk like a pirate."
             transcribe_user_audio=True,
             transcribe_model_audio=True,
+            system_instruction=SYSTEM_INSTRUCTION,
+            tools=[{"google_search": {}}, {"code_execution": {}}],
+            params=InputParams(modalities=GeminiMultimodalModalities.TEXT),
         )
-        llm.set_model_modalities(
-            GeminiMultimodalModalities.TEXT
-        )  # This forces model to produce text only responses
+
+        # Alternatively, set the response modalities after construction with a method call:
+        # llm.set_model_modalities(
+        #     GeminiMultimodalModalities.TEXT
+        # )
 
         tts = CartesiaTTSService(
             api_key=os.getenv("CARTESIA_API_KEY"), voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22"
         )
 
+        messages = [
+            {
+                "role": "user",
+                "content": 'Start by saying "Hello, I\'m Gemini".',
+            },
+        ]
+
+        # Set up conversation context and management
+        # The context_aggregator will automatically collect conversation context
+        context = OpenAILLMContext(messages)
+        context_aggregator = llm.create_context_aggregator(context)
+
         pipeline = Pipeline(
             [
                 transport.input(),
+                context_aggregator.user(),
                 llm,
                 tts,
                 transport.output(),
+                context_aggregator.assistant(),
             ]
         )
 
@@ -85,6 +116,16 @@ async def main():
             ),
         )
 
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            await transport.capture_participant_transcription(participant["id"])
+            await task.queue_frames([context_aggregator.user().get_context_frame()])
+
+        @transport.event_handler("on_participant_left")
+        async def on_participant_left(transport, participant, reason):
+            print(f"Participant left: {participant}")
+            await task.queue_frame(EndFrame())
+
         runner = PipelineRunner()
 
         await runner.run(task)