livekit · theomonnom · Jan 22, 2025 · Jan 11, 2025 · Jan 11, 2025 · Jan 11, 2025
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py
@@ -76,6 +76,7 @@ class ServerVad(TypedDict):
     threshold: NotRequired[float]
     prefix_padding_ms: NotRequired[int]
     silence_duration_ms: NotRequired[int]
+    create_response: NotRequired[bool]
 
 
 class FunctionTool(TypedDict):
@@ -307,6 +308,8 @@ class ResponseCreateData(TypedDict, total=False):
         tools: list[FunctionTool]
         tool_choice: ToolChoice
         temperature: float
+        conversation: Literal["auto", "none"]
+        metadata: NotRequired[map | None]
         max_output_tokens: int | Literal["inf"]
 
     class ResponseCreate(TypedDict):

diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -76,6 +76,8 @@ class RealtimeResponse:
     """timestamp when the response was created"""
     _first_token_timestamp: float | None = None
     """timestamp when the first token was received"""
+    metadata: map | None = None
+    """developer-provided string key-value pairs"""
 
 
 @dataclass
@@ -140,6 +142,7 @@ class ServerVadOptions:
     threshold: float
     prefix_padding_ms: int
     silence_duration_ms: int
+    create_response: bool
 
 
 @dataclass
@@ -191,6 +194,7 @@ class _ContentPtr(TypedDict):
     threshold=0.5,
     prefix_padding_ms=300,
     silence_duration_ms=500,
+    create_response=True,
 )
 
 DEFAULT_INPUT_AUDIO_TRANSCRIPTION = InputTranscriptionOptions(model="whisper-1")
@@ -717,6 +721,10 @@ def create(
             on_duplicate: Literal[
                 "cancel_existing", "cancel_new", "keep_both"
             ] = "keep_both",
+            instructions: str = "",
+            modalities: list[api_proto.Modality] = ["text", "audio"],
+            conversation: Literal["auto", "none"] = "auto",
+            metadata: map | None = None,
         ) -> asyncio.Future[bool]:
             """Creates a new response.
 
@@ -725,6 +733,12 @@ def create(
                     - "cancel_existing": Cancel the existing response before creating new one
                     - "cancel_new": Skip creating new response if one is in progress
                     - "keep_both": Wait for the existing response to be done and then create a new one
+                instructions: explicit prompt used for out-of-band events
+                modalities: set of modalities that the model can respond in, defaults to text and audio
+                conversation: specifies whether the response is out-of-band
+                    - "auto": Contents of the response will be added to the default conversation
+                    - "none": Creates an out-of-band response which will not add items to default conversation
+                metadata: set of key-value pairs that can be used for storing additional information
 
             Returns:
                 Future that resolves when the response create request is queued
@@ -758,7 +772,17 @@ def create(
                 or self._sess._pending_responses[active_resp_id].done_fut.done()
             ):
                 # no active response in progress, create a new one
-                self._sess._queue_msg({"type": "response.create"})
+                self._sess._queue_msg(
+                    {
+                        "type": "response.create",
+                        "response": {
+                            "instructions": instructions,
+                            "modalities": modalities,
+                            "conversation": conversation,
+                            "metadata": metadata,
+                        },
+                    }
+                )
                 _fut = asyncio.Future[bool]()
                 _fut.set_result(True)
                 return _fut
@@ -795,7 +819,17 @@ async def wait_and_create() -> bool:
                 )
                 new_create_fut = asyncio.Future[None]()
                 self._sess._response_create_fut = new_create_fut
-                self._sess._queue_msg({"type": "response.create"})
+                self._sess._queue_msg(
+                    {
+                        "type": "response.create",
+                        "response": {
+                            "instructions": instructions,
+                            "modalities": modalities,
+                            "conversation": conversation,
+                            "metadata": metadata,
+                        },
+                    }
+                )
                 return True
 
             return asyncio.create_task(wait_and_create())
@@ -928,6 +962,7 @@ def session_update(
                 "threshold": self._opts.turn_detection.threshold,
                 "prefix_padding_ms": self._opts.turn_detection.prefix_padding_ms,
                 "silence_duration_ms": self._opts.turn_detection.silence_duration_ms,
+                "create_response": self._opts.turn_detection.create_response,
             }
         input_audio_transcription_opts: api_proto.InputAudioTranscription | None = None
         if self._opts.input_audio_transcription is not None:
@@ -1228,6 +1263,7 @@ def _handle_session_updated(
                 threshold=session["turn_detection"]["threshold"],
                 prefix_padding_ms=session["turn_detection"]["prefix_padding_ms"],
                 silence_duration_ms=session["turn_detection"]["silence_duration_ms"],
+                create_response=True,
             )
         if session["input_audio_transcription"] is None:
             input_audio_transcription = None
@@ -1407,11 +1443,13 @@ def _handle_response_created(
         response = response_created["response"]
         done_fut = self._loop.create_future()
         status_details = response.get("status_details")
+        metadata = cast(map, response.get("metadata"))
         new_response = RealtimeResponse(
             id=response["id"],
             status=response["status"],
             status_details=status_details,
             output=[],
+            metadata=metadata,
             usage=response.get("usage"),
             done_fut=done_fut,
             _created_timestamp=time.time(),
@@ -1586,6 +1624,8 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
 
         response.status = response_data["status"]
         response.status_details = response_data.get("status_details")
+        response.metadata = cast(map, response_data.get("metadata"))
+        response.output = cast(list[RealtimeOutput], response_data.get("output"))
         response.usage = response_data.get("usage")
 
         metrics_error = None