Skip to content

Commit

Permalink
Merge pull request #974 from pipecat-ai/mb/26d-example
Browse files: browse the repository at this point in the history
Align 26d example with foundation norms
  • Loading branch information
markbackman authored Jan 13, 2025
2 parents 3cd2b90 + f406d93 commit da18785
Showing 1 changed file with 45 additions and 4 deletions.
49 changes: 45 additions & 4 deletions examples/foundational/26d-gemini-multimodal-live-text.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,16 @@

from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import EndFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.gemini_multimodal_live.gemini import (
GeminiMultimodalLiveLLMService,
GeminiMultimodalModalities,
InputParams,
)
from pipecat.transports.services.daily import DailyParams, DailyTransport

Expand All @@ -30,6 +33,16 @@
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")

# System prompt handed to the Gemini service via `system_instruction`.
# This is a plain triple-quoted string rather than an f-string because it has
# no placeholders to interpolate. The prompt text contains no double quotes, so
# the model never receives an unbalanced quote character, in keeping with the
# prompt's own request to avoid special characters.
SYSTEM_INSTRUCTION = """
You are Gemini Chatbot, a friendly, helpful robot.
Your goal is to demonstrate your capabilities in a succinct way.
Your output will be converted to audio so don't include special characters in your answers.
Respond to what the user said in a creative and helpful way. Keep your responses brief. One or two sentences at most.
"""


async def main():
async with aiohttp.ClientSession() as session:
Expand All @@ -55,24 +68,42 @@ async def main():

llm = GeminiMultimodalLiveLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
# system_instruction="Talk like a pirate."
transcribe_user_audio=True,
transcribe_model_audio=True,
system_instruction=SYSTEM_INSTRUCTION,
tools=[{"google_search": {}}, {"code_execution": {}}],
params=InputParams(modalities=GeminiMultimodalModalities.TEXT),
)
llm.set_model_modalities(
GeminiMultimodalModalities.TEXT
) # This forces the model to produce text-only responses

# Optionally, you can set the response modalities via a function
# llm.set_model_modalities(
# GeminiMultimodalModalities.TEXT
# )

tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"), voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22"
)

messages = [
{
"role": "user",
"content": 'Start by saying "Hello, I\'m Gemini".',
},
]

# Set up conversation context and management
# The context_aggregator will automatically collect conversation context
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)

pipeline = Pipeline(
[
transport.input(),
context_aggregator.user(),
llm,
tts,
transport.output(),
context_aggregator.assistant(),
]
)

Expand All @@ -85,6 +116,16 @@ async def main():
),
)

@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
await task.queue_frames([context_aggregator.user().get_context_frame()])

@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
print(f"Participant left: {participant}")
await task.queue_frame(EndFrame())

runner = PipelineRunner()

await runner.run(task)
Expand Down

0 comments on commit da18785

Please sign in to comment.