diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py
index 2bf9778d3..f580035cc 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py
@@ -76,6 +76,7 @@ class ServerVad(TypedDict):
     threshold: NotRequired[float]
     prefix_padding_ms: NotRequired[int]
     silence_duration_ms: NotRequired[int]
+    create_response: NotRequired[bool]
 
 
 class FunctionTool(TypedDict):
@@ -307,6 +308,8 @@ class ResponseCreateData(TypedDict, total=False):
     tools: list[FunctionTool]
     tool_choice: ToolChoice
    temperature: float
+    conversation: Literal["auto", "none"]
+    metadata: NotRequired[dict[str, str] | None]
     max_output_tokens: int | Literal["inf"]
 
 
 class ResponseCreate(TypedDict):
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
index 8b6b717f7..84420811b 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -76,6 +76,8 @@ class RealtimeResponse:
     """timestamp when the response was created"""
     _first_token_timestamp: float | None = None
     """timestamp when the first token was received"""
+    metadata: dict[str, str] | None = None
+    """developer-provided string key-value pairs"""
 
 
 @dataclass
@@ -140,6 +142,7 @@ class ServerVadOptions:
     threshold: float
     prefix_padding_ms: int
     silence_duration_ms: int
+    create_response: bool
 
 
 @dataclass
@@ -191,6 +194,7 @@ class _ContentPtr(TypedDict):
     threshold=0.5,
     prefix_padding_ms=300,
     silence_duration_ms=500,
+    create_response=True,
 )
 DEFAULT_INPUT_AUDIO_TRANSCRIPTION = InputTranscriptionOptions(model="whisper-1")
 
@@ -717,6 +721,10 @@ def create(
             on_duplicate: Literal[
                 "cancel_existing", "cancel_new", "keep_both"
             ] = "keep_both",
+            instructions: str = "",
+            modalities: list[api_proto.Modality] = ["text", "audio"],
+            conversation: Literal["auto", "none"] = "auto",
+            metadata: dict[str, str] | None = None,
         ) -> asyncio.Future[bool]:
             """Creates a new response.
 
@@ -725,6 +733,12 @@ def create(
                     - "cancel_existing": Cancel the existing response before creating new one
                     - "cancel_new": Skip creating new response if one is in progress
                     - "keep_both": Wait for the existing response to be done and then create a new one
+                instructions: explicit prompt to use for this response (useful for out-of-band responses)
+                modalities: modalities the model may respond with, defaults to ["text", "audio"]
+                conversation: specifies whether the response is out-of-band
+                    - "auto": Contents of the response will be added to the default conversation
+                    - "none": Creates an out-of-band response which will not add items to the default conversation
+                metadata: set of key-value pairs that can be attached to the response for storing additional information
 
             Returns:
                 Future that resolves when the response create request is queued
@@ -758,7 +772,17 @@ def create(
                 or self._sess._pending_responses[active_resp_id].done_fut.done()
             ):
                 # no active response in progress, create a new one
-                self._sess._queue_msg({"type": "response.create"})
+                self._sess._queue_msg(
+                    {
+                        "type": "response.create",
+                        "response": {
+                            "instructions": instructions,
+                            "modalities": modalities,
+                            "conversation": conversation,
+                            "metadata": metadata,
+                        },
+                    }
+                )
                 _fut = asyncio.Future[bool]()
                 _fut.set_result(True)
                 return _fut
@@ -795,7 +819,17 @@ async def wait_and_create() -> bool:
                 )
                 new_create_fut = asyncio.Future[None]()
                 self._sess._response_create_fut = new_create_fut
-                self._sess._queue_msg({"type": "response.create"})
+                self._sess._queue_msg(
+                    {
+                        "type": "response.create",
+                        "response": {
+                            "instructions": instructions,
+                            "modalities": modalities,
+                            "conversation": conversation,
+                            "metadata": metadata,
+                        },
+                    }
+                )
                 return True
 
             return asyncio.create_task(wait_and_create())
@@ -928,6 +962,7 @@ def session_update(
                 "threshold": self._opts.turn_detection.threshold,
                 "prefix_padding_ms": self._opts.turn_detection.prefix_padding_ms,
                 "silence_duration_ms": self._opts.turn_detection.silence_duration_ms,
+                "create_response": self._opts.turn_detection.create_response,
             }
         input_audio_transcription_opts: api_proto.InputAudioTranscription | None = None
         if self._opts.input_audio_transcription is not None:
@@ -1228,6 +1263,7 @@ def _handle_session_updated(
                 threshold=session["turn_detection"]["threshold"],
                 prefix_padding_ms=session["turn_detection"]["prefix_padding_ms"],
                 silence_duration_ms=session["turn_detection"]["silence_duration_ms"],
+                create_response=True,
             )
         if session["input_audio_transcription"] is None:
             input_audio_transcription = None
@@ -1407,11 +1443,13 @@ def _handle_response_created(
         response = response_created["response"]
         done_fut = self._loop.create_future()
         status_details = response.get("status_details")
+        metadata = cast(dict[str, str] | None, response.get("metadata"))
         new_response = RealtimeResponse(
             id=response["id"],
             status=response["status"],
             status_details=status_details,
             output=[],
+            metadata=metadata,
             usage=response.get("usage"),
             done_fut=done_fut,
             _created_timestamp=time.time(),
@@ -1586,6 +1624,8 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDone):
         response.status = response_data["status"]
         response.status_details = response_data.get("status_details")
+        response.metadata = cast(dict[str, str] | None, response_data.get("metadata"))
+
         response.output = cast(list[RealtimeOutput], response_data.get("output"))
         response.usage = response_data.get("usage")
         metrics_error = None
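
Reviewer note: below is a minimal usage sketch of the new options, not part of the diff itself. It assumes the plugin's existing public surface (a RealtimeModel constructed with turn_detection=ServerVadOptions(...), a RealtimeSession obtained here via model.session(...), and the session.response.create(...) helper patched above). Import paths follow the file layout in this diff, and the instructions text and metadata keys are illustrative only; the exact way a session is obtained will depend on the surrounding agent code.

from livekit.agents import llm
from livekit.plugins.openai import realtime

# Server VAD still detects end of speech, but with create_response=False it no
# longer auto-creates a response; responses are triggered explicitly below.
model = realtime.RealtimeModel(
    turn_detection=realtime.ServerVadOptions(
        threshold=0.5,
        prefix_padding_ms=300,
        silence_duration_ms=500,
        create_response=False,  # flag added by this diff
    ),
)

# Hypothetical session setup; in a real agent the session usually comes from
# the surrounding framework rather than being created by hand.
session = model.session(chat_ctx=llm.ChatContext())

# Out-of-band response: conversation="none" keeps its items out of the default
# conversation, and the metadata dict is echoed back on the resulting
# response.created / response.done events so the result can be correlated here.
session.response.create(
    instructions="Summarize the conversation so far in one short paragraph.",
    modalities=["text"],
    conversation="none",
    metadata={"purpose": "summary"},
)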