From 6c726e79cd99398b807fd8f158a8718e5f5844d9 Mon Sep 17 00:00:00 2001
From: Hayden Housen
Date: Fri, 11 Aug 2023 19:45:22 +0000
Subject: [PATCH] Add llama.cpp agent (#254)

* Add llama.cpp agent

* Add some docs

* Add LlamacppAgent to AgentFactory

* Remove comment

* Refactor duplicated sentence parsing logic

* Merge ActionAgent into Base and RespondAgents and make sentence splitting vendor agnostic

* Fix some mypy

* factor out OpenAI token logic and complete ActionAgent merge

* factor out function call logic

* docs updates and cleanup/revert prompt change

* pass mypy

* pr changes and test

* improve type hinting

* add function test

* add test case where the request is only func call

* Use new collate_response_async function

* Fix mypy

* merge tests

---------

Co-authored-by: zaptrem
Co-authored-by: Ajay Raj
---
 docs/local-conversation.mdx              |  42 +++++++
 tests/streaming/agent/test_utils.py      |   2 +-
 vocode/streaming/agent/factory.py        |   4 +
 vocode/streaming/agent/llamacpp_agent.py | 148 +++++++++++++++++++++++
 vocode/streaming/models/agent.py         |   8 ++
 5 files changed, 203 insertions(+), 1 deletion(-)
 create mode 100644 vocode/streaming/agent/llamacpp_agent.py

diff --git a/docs/local-conversation.mdx b/docs/local-conversation.mdx
index 32fe7ac21..97aef12f5 100644
--- a/docs/local-conversation.mdx
+++ b/docs/local-conversation.mdx
@@ -105,6 +105,48 @@ StreamingConversation(
 )
 ```
 
+## Llama.cpp
+
+You can use any model supported by [llama.cpp](https://github.com/ggerganov/llama.cpp) with Vocode. This includes LLaMA, Alpaca, Vicuna, Koala, WizardLM, and more. We will use [NousResearch/Nous-Hermes-13b](https://huggingface.co/NousResearch/Nous-Hermes-13b) in this example because it currently ranks highly on HuggingFace's [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
+
+Our implementation is built on top of [langchain](https://python.langchain.com/docs/modules/model_io/models/llms/integrations/llamacpp), which integrates with llama.cpp through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python).
+
+Install `llama-cpp-python` by running the following:
+
+```
+pip install llama-cpp-python
+```
+
+or run the following to install it with support for offloading model layers to a GPU via CUDA:
+
+```
+CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+```
+
+[llama-cpp-python](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal) has more installation commands for different BLAS backends.
+
+Set up your agent in `StreamingConversation` as follows:
+
+```python
+from vocode.streaming.models.agent import LlamacppAgentConfig
+from vocode.streaming.agent.llamacpp_agent import LlamacppAgent
+
+StreamingConversation(
+    ...
+    agent=LlamacppAgent(
+        LlamacppAgentConfig(
+            prompt_preamble="The AI is having a pleasant conversation about life",
+            llamacpp_kwargs={"model_path": "path/to/nous-hermes-13b.ggmlv3.q4_0.bin", "verbose": True},
+            prompt_template="alpaca",
+            initial_message=BaseMessage(text="Hello!"),
+        )
+    )
+    ...
+)
+```
+
+You can add the key `n_gpu_layers` to the `llamacpp_kwargs` to offload some of the model's layers to a GPU.
+
 ## Coqui TTS
 
 Install the Coqui TTS package by running:
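A note on the Llama.cpp snippet above: the example uses `BaseMessage` without importing it, and GPU offloading goes through `llamacpp_kwargs` rather than a dedicated config field. A minimal sketch of a complete config, assuming `BaseMessage` lives in `vocode.streaming.models.message` as elsewhere in vocode; the model path and layer count are placeholders:

```python
from vocode.streaming.models.agent import LlamacppAgentConfig
from vocode.streaming.models.message import BaseMessage  # assumed import location

agent_config = LlamacppAgentConfig(
    prompt_preamble="The AI is having a pleasant conversation about life",
    llamacpp_kwargs={
        "model_path": "path/to/nous-hermes-13b.ggmlv3.q4_0.bin",  # placeholder path
        "n_gpu_layers": 40,  # illustrative value; requires the cuBLAS build shown above
        "verbose": True,
    },
    prompt_template="alpaca",
    initial_message=BaseMessage(text="Hello!"),
)
```

Any keyword accepted by langchain's `LlamaCpp` constructor can be passed this way, since the agent splats the dict into `LlamaCpp(**agent_config.llamacpp_kwargs)` (see the new file further below).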
diff --git a/tests/streaming/agent/test_utils.py b/tests/streaming/agent/test_utils.py
index 0769954da..96cb4848e 100644
--- a/tests/streaming/agent/test_utils.py
+++ b/tests/streaming/agent/test_utils.py
@@ -299,7 +299,7 @@ class StreamOpenAIResponseTestCase(BaseModel):
 
 
 @pytest.mark.asyncio
-async def test_stream_openai_response_async():
+async def test_collate_response_async():
     test_cases = [
         StreamOpenAIResponseTestCase(
             openai_objects=[
diff --git a/vocode/streaming/agent/factory.py b/vocode/streaming/agent/factory.py
index 1f0c06a53..3cd0f162f 100644
--- a/vocode/streaming/agent/factory.py
+++ b/vocode/streaming/agent/factory.py
@@ -10,6 +10,7 @@
 from vocode.streaming.agent.restful_user_implemented_agent import (
     RESTfulUserImplementedAgent,
 )
+from vocode.streaming.agent.llamacpp_agent import LlamacppAgent
 from vocode.streaming.models.agent import (
     AgentConfig,
     AgentType,
@@ -19,6 +20,7 @@
     InformationRetrievalAgentConfig,
     LLMAgentConfig,
     RESTfulUserImplementedAgentConfig,
+    LlamacppAgentConfig
 )
 
 
@@ -38,4 +40,6 @@ def create_agent(
             return RESTfulUserImplementedAgent(agent_config=agent_config, logger=logger)
         elif isinstance(agent_config, ChatAnthropicAgentConfig):
            return ChatAnthropicAgent(agent_config=agent_config, logger=logger)
+        elif isinstance(agent_config, LlamacppAgentConfig):
+            return LlamacppAgent(agent_config=agent_config, logger=logger)
         raise Exception("Invalid agent config", agent_config.type)
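With the new `elif` branch above, anything that builds agents through the factory can hand it a `LlamacppAgentConfig`. A rough sketch of that path, assuming the `create_agent` method shown in the hunk is wrapped by the default `AgentFactory` class in `vocode.streaming.agent.factory` (the class name and model path are assumptions, not shown in this diff):

```python
import logging

from vocode.streaming.agent.factory import AgentFactory  # assumed wrapper class
from vocode.streaming.models.agent import LlamacppAgentConfig

# The isinstance dispatch in create_agent returns a LlamacppAgent for this config.
agent = AgentFactory().create_agent(
    agent_config=LlamacppAgentConfig(
        prompt_preamble="The AI is having a pleasant conversation about life",
        llamacpp_kwargs={"model_path": "path/to/model.ggmlv3.q4_0.bin"},  # placeholder
    ),
    logger=logging.getLogger(__name__),
)
```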
diff --git a/vocode/streaming/agent/llamacpp_agent.py b/vocode/streaming/agent/llamacpp_agent.py
new file mode 100644
index 000000000..10c88566f
--- /dev/null
+++ b/vocode/streaming/agent/llamacpp_agent.py
@@ -0,0 +1,148 @@
+from concurrent.futures import ThreadPoolExecutor
+import asyncio
+import logging
+from typing import AsyncGenerator, Optional, Tuple, Any, Union
+import typing
+from langchain import ConversationChain
+from vocode.streaming.agent.base_agent import RespondAgent
+from vocode.streaming.models.agent import LlamacppAgentConfig
+from vocode.streaming.agent.utils import collate_response_async
+from langchain.callbacks.base import BaseCallbackHandler
+from langchain.callbacks.manager import CallbackManager
+from langchain.llms import LlamaCpp
+from langchain.prompts import (
+    ChatPromptTemplate,
+    MessagesPlaceholder,
+    HumanMessagePromptTemplate,
+)
+from pydantic import BaseModel
+from langchain.schema import LLMResult, SystemMessage, get_buffer_string
+from langchain.memory import ConversationBufferMemory
+from langchain.prompts import (
+    ChatPromptTemplate,
+    MessagesPlaceholder,
+    HumanMessagePromptTemplate,
+    PromptTemplate,
+)
+from langchain.prompts.base import DEFAULT_FORMATTER_MAPPING
+
+ALPACA_TEMPLATE_WITH_HISTORY = """### Instruction:
+Your previous conversation history:
+{history}
+
+Current instruction/message to respond to: {input}
+### Response:"""
+
+
+class CallbackOutput(BaseModel):
+    finish: bool = False
+    response: Optional[LLMResult] = None
+    token: str = ""
+
+
+class FormatHistoryPromptTemplate(PromptTemplate):
+    def format(self, **kwargs: Any) -> str:
+        kwargs = self._merge_partial_and_user_variables(**kwargs)
+        kwargs["history"] = get_buffer_string(kwargs["history"])
+        return DEFAULT_FORMATTER_MAPPING[self.template_format](self.template, **kwargs)
+
+
+class CustomStreamingCallbackHandler(BaseCallbackHandler):
+    def __init__(self, output_queue: asyncio.Queue) -> None:
+        super().__init__()
+        self.output_queue = output_queue
+
+    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
+        """Run on new LLM token. Only available when streaming is enabled."""
+        self.output_queue.put_nowait(CallbackOutput(token=token))
+
+    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
+        """Run when LLM ends running."""
+        self.output_queue.put_nowait(CallbackOutput(finish=True, response=response))
+
+
+class LlamacppAgent(RespondAgent[LlamacppAgentConfig]):
+    def __init__(
+        self,
+        agent_config: LlamacppAgentConfig,
+        logger: Optional[logging.Logger] = None,
+    ):
+        super().__init__(agent_config=agent_config, logger=logger)
+
+        self.prompt: Union[PromptTemplate, ChatPromptTemplate]
+        if type(agent_config.prompt_template) is str:
+            if agent_config.prompt_template == "alpaca":
+                self.prompt = FormatHistoryPromptTemplate(
+                    input_variables=["history", "input"],
+                    template=ALPACA_TEMPLATE_WITH_HISTORY,
+                )
+            else:
+                raise ValueError(
+                    f"Unknown prompt template {agent_config.prompt_template}"
+                )
+        else:
+            if agent_config.prompt_template is None:
+                self.prompt = ChatPromptTemplate.from_messages(
+                    [
+                        MessagesPlaceholder(variable_name="history"),
+                        HumanMessagePromptTemplate.from_template("{input}"),
+                    ]
+                )
+            else:
+                self.prompt = typing.cast(PromptTemplate, agent_config.prompt_template)
+
+        self.callback_queue: asyncio.Queue = asyncio.Queue()
+        callback = CustomStreamingCallbackHandler(self.callback_queue)
+        callback_manager = CallbackManager([callback])
+        self.llm = LlamaCpp(
+            callback_manager=callback_manager, **agent_config.llamacpp_kwargs
+        )
+
+        self.memory = ConversationBufferMemory(return_messages=True)
+        self.memory.chat_memory.messages.append(
+            SystemMessage(content=self.agent_config.prompt_preamble)
+        )
+
+        self.conversation = ConversationChain(
+            memory=self.memory, prompt=self.prompt, llm=self.llm
+        )
+        self.thread_pool_executor = ThreadPoolExecutor(max_workers=1)
+
+    async def respond(
+        self,
+        human_input,
+        conversation_id: str,
+        is_interrupt: bool = False,
+    ) -> Tuple[str, bool]:
+        text = await asyncio.get_event_loop().run_in_executor(
+            self.thread_pool_executor,
+            lambda input: self.conversation.predict(input=input),
+            human_input,
+        )
+
+        self.logger.debug(f"LLM response: {text}")
+        return text, False
+
+    async def llamacpp_get_tokens(self):
+        while True:
+            callback_output = await self.callback_queue.get()
+            if callback_output.finish:
+                break
+            yield callback_output.token
+
+    async def generate_response(
+        self,
+        human_input: str,
+        conversation_id: str,
+        is_interrupt: bool = False,
+    ) -> AsyncGenerator[str, None]:
+        asyncio.get_event_loop().run_in_executor(
+            self.thread_pool_executor,
+            lambda input: self.conversation.predict(input=input),
+            human_input,
+        )
+
+        async for message in collate_response_async(
+            self.llamacpp_get_tokens(),
+        ):
+            yield str(message)
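To see the token flow end to end — `LlamaCpp` pushes tokens into the callback queue from the executor thread while `collate_response_async` assembles them into sentences — here is a minimal driver sketch. It is not part of the patch; the model path is a placeholder and a real GGML model file is required to actually run it:

```python
import asyncio

from vocode.streaming.agent.llamacpp_agent import LlamacppAgent
from vocode.streaming.models.agent import LlamacppAgentConfig


async def main() -> None:
    agent = LlamacppAgent(
        LlamacppAgentConfig(
            prompt_preamble="The AI is having a pleasant conversation about life",
            llamacpp_kwargs={"model_path": "path/to/nous-hermes-13b.ggmlv3.q4_0.bin"},  # placeholder
            prompt_template="alpaca",
        )
    )
    # generate_response kicks off conversation.predict in the thread pool and
    # yields collated sentences as tokens arrive on the callback queue.
    async for sentence in agent.generate_response(
        "Tell me about llamas.", conversation_id="example-conversation"
    ):
        print(sentence)


if __name__ == "__main__":
    asyncio.run(main())
```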
diff --git a/vocode/streaming/models/agent.py b/vocode/streaming/models/agent.py
index a4d6b8730..e6dfc5336 100644
--- a/vocode/streaming/models/agent.py
+++ b/vocode/streaming/models/agent.py
@@ -1,5 +1,6 @@
 from typing import List, Optional, Union
 from enum import Enum
+from langchain.prompts import PromptTemplate
 from pydantic import validator
 
 from vocode.streaming.models.actions import ActionConfig
@@ -30,6 +31,7 @@ class AgentType(str, Enum):
     CHAT_VERTEX_AI = "agent_chat_vertex_ai"
     ECHO = "agent_echo"
     GPT4ALL = "agent_gpt4all"
+    LLAMACPP = "agent_llamacpp"
     INFORMATION_RETRIEVAL = "agent_information_retrieval"
     RESTFUL_USER_IMPLEMENTED = "agent_restful_user_implemented"
     WEBSOCKET_USER_IMPLEMENTED = "agent_websocket_user_implemented"
@@ -108,6 +110,12 @@ class ChatVertexAIAgentConfig(AgentConfig, type=AgentType.CHAT_VERTEX_AI.value):
     generate_responses: bool = False  # Google Vertex AI doesn't support streaming
 
 
+class LlamacppAgentConfig(AgentConfig, type=AgentType.LLAMACPP.value):
+    prompt_preamble: str
+    llamacpp_kwargs: dict = {}
+    prompt_template: Optional[Union[PromptTemplate, str]] = None
+
+
 class InformationRetrievalAgentConfig(
     AgentConfig, type=AgentType.INFORMATION_RETRIEVAL.value
 ):
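Because `prompt_template` accepts either the `"alpaca"` shortcut or a langchain `PromptTemplate`, other instruction formats can be plugged in without touching the agent. A sketch of that path, reusing the patch's `FormatHistoryPromptTemplate` so the buffered history is rendered as plain text; the Vicuna-style template text and model path are illustrative, not from the patch:

```python
from vocode.streaming.agent.llamacpp_agent import (
    FormatHistoryPromptTemplate,
    LlamacppAgent,
)
from vocode.streaming.models.agent import LlamacppAgentConfig

# Illustrative Vicuna-style prompt; {history} and {input} are the variables
# the ConversationChain inside LlamacppAgent expects.
VICUNA_STYLE_TEMPLATE = """A chat between a curious user and a helpful AI assistant.

{history}

USER: {input}
ASSISTANT:"""

agent = LlamacppAgent(
    LlamacppAgentConfig(
        prompt_preamble="The AI is having a pleasant conversation about life",
        llamacpp_kwargs={"model_path": "path/to/model.ggmlv3.q4_0.bin"},  # placeholder
        prompt_template=FormatHistoryPromptTemplate(
            input_variables=["history", "input"],
            template=VICUNA_STYLE_TEMPLATE,
        ),
    )
)
```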