From 6c726e79cd99398b807fd8f158a8718e5f5844d9 Mon Sep 17 00:00:00 2001
From: Hayden Housen
Date: Fri, 11 Aug 2023 19:45:22 +0000
Subject: [PATCH] Add llama.cpp agent (#254)

* Add llama.cpp agent

* Add some docs

* Add LlamacppAgent to AgentFactory

* Remove comment

* Refactor duplicated sentence parsing logic

* Merge ActionAgent into Base and RespondAgents and make sentence splitting vendor agnostic

* Fix some mypy

* factor out OpenAI token logic and complete ActionAgent merge

* factor out function call logic

* docs updates and cleanup/revert prompt change

* pass mypy

* pr changes and test

* improve type hinting

* add function test

* add test case where the request is only func call

* Use new collate_response_async function

* Fix mypy

* merge tests

---------

Co-authored-by: zaptrem
Co-authored-by: Ajay Raj
---
 docs/local-conversation.mdx              |  42 +++++++
 tests/streaming/agent/test_utils.py      |   2 +-
 vocode/streaming/agent/factory.py        |   4 +
 vocode/streaming/agent/llamacpp_agent.py | 148 +++++++++++++++++++++++
 vocode/streaming/models/agent.py         |   8 ++
 5 files changed, 203 insertions(+), 1 deletion(-)
 create mode 100644 vocode/streaming/agent/llamacpp_agent.py

diff --git a/docs/local-conversation.mdx b/docs/local-conversation.mdx
index 32fe7ac21..97aef12f5 100644
--- a/docs/local-conversation.mdx
+++ b/docs/local-conversation.mdx
@@ -105,6 +105,48 @@ StreamingConversation(
 )
 ```
 
+## Llama.cpp
+
+You can use any model supported by [llama.cpp](https://github.com/ggerganov/llama.cpp) with Vocode. This includes LLaMA, Alpaca, Vicuna, Koala, WizardLM, and more. We will use [NousResearch/Nous-Hermes-13b](https://huggingface.co/NousResearch/Nous-Hermes-13b) in this example because it currently ranks highly on HuggingFace's [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
+
+Our implementation is built on top of [langchain](https://python.langchain.com/docs/modules/model_io/models/llms/integrations/llamacpp), which integrates with llama.cpp through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python).
+
+Install `llama-cpp-python` by running the following:
+
+```
+pip install llama-cpp-python
+```
+
+or run the following to install it with support for offloading model layers to a GPU via CUDA:
+
+```
+CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+```
+
+[llama-cpp-python](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal) has more installation commands for different BLAS backends.
+
+Set up your agent in `StreamingConversation` as follows:
+
+```python
+from vocode.streaming.models.agent import LlamacppAgentConfig
+from vocode.streaming.agent.llamacpp_agent import LlamacppAgent
+
+StreamingConversation(
+    ...
+    agent=LlamacppAgent(
+        LlamacppAgentConfig(
+            prompt_preamble="The AI is having a pleasant conversation about life",
+            llamacpp_kwargs={"model_path": "path/to/nous-hermes-13b.ggmlv3.q4_0.bin", "verbose": True},
+            prompt_template="alpaca",
+            initial_message=BaseMessage(text="Hello!"),
+        )
+    )
+    ...
+)
+```
+
+You can add the key `n_gpu_layers` to the `llamacpp_kwargs` to offload some of the model's layers to a GPU.
+
 ## Coqui TTS
 
 Install the Coqui TTS package by running:
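A note on the Llama.cpp snippet above: the example uses `BaseMessage` without importing it, and GPU offloading goes through `llamacpp_kwargs` rather than a dedicated config field. A minimal sketch of a complete config, assuming `BaseMessage` lives in `vocode.streaming.models.message` as elsewhere in vocode; the model path and layer count are placeholders:

```python
from vocode.streaming.models.agent import LlamacppAgentConfig
from vocode.streaming.models.message import BaseMessage  # assumed import location

agent_config = LlamacppAgentConfig(
    prompt_preamble="The AI is having a pleasant conversation about life",
    llamacpp_kwargs={
        "model_path": "path/to/nous-hermes-13b.ggmlv3.q4_0.bin",  # placeholder path
        "n_gpu_layers": 40,  # illustrative value; requires the cuBLAS build shown above
        "verbose": True,
    },
    prompt_template="alpaca",
    initial_message=BaseMessage(text="Hello!"),
)
```

Any keyword accepted by langchain's `LlamaCpp` constructor can be passed this way, since the agent splats the dict into `LlamaCpp(**agent_config.llamacpp_kwargs)` (see the new file further below).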
diff --git a/tests/streaming/agent/test_utils.py b/tests/streaming/agent/test_utils.py
index 0769954da..96cb4848e 100644
--- a/tests/streaming/agent/test_utils.py
+++ b/tests/streaming/agent/test_utils.py
@@ -299,7 +299,7 @@ class StreamOpenAIResponseTestCase(BaseModel):
 
 
 @pytest.mark.asyncio
-async def test_stream_openai_response_async():
+async def test_collate_response_async():
     test_cases = [
         StreamOpenAIResponseTestCase(
             openai_objects=[
diff --git a/vocode/streaming/agent/factory.py b/vocode/streaming/agent/factory.py
index 1f0c06a53..3cd0f162f 100644
--- a/vocode/streaming/agent/factory.py
+++ b/vocode/streaming/agent/factory.py
@@ -10,6 +10,7 @@
 from vocode.streaming.agent.restful_user_implemented_agent import (
     RESTfulUserImplementedAgent,
 )
+from vocode.streaming.agent.llamacpp_agent import LlamacppAgent
 from vocode.streaming.models.agent import (
     AgentConfig,
     AgentType,
@@ -19,6 +20,7 @@
     InformationRetrievalAgentConfig,
     LLMAgentConfig,
     RESTfulUserImplementedAgentConfig,
+    LlamacppAgentConfig
 )
 
 
@@ -38,4 +40,6 @@ def create_agent(
             return RESTfulUserImplementedAgent(agent_config=agent_config, logger=logger)
         elif isinstance(agent_config, ChatAnthropicAgentConfig):
            return ChatAnthropicAgent(agent_config=agent_config, logger=logger)
+        elif isinstance(agent_config, LlamacppAgentConfig):
+            return LlamacppAgent(agent_config=agent_config, logger=logger)
         raise Exception("Invalid agent config", agent_config.type)
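With the new `elif` branch above, anything that builds agents through the factory can hand it a `LlamacppAgentConfig`. A rough sketch of that path, assuming the `create_agent` method shown in the hunk is wrapped by the default `AgentFactory` class in `vocode.streaming.agent.factory` (the class name and model path are assumptions, not shown in this diff):

```python
import logging

from vocode.streaming.agent.factory import AgentFactory  # assumed wrapper class
from vocode.streaming.models.agent import LlamacppAgentConfig

# The isinstance dispatch in create_agent returns a LlamacppAgent for this config.
agent = AgentFactory().create_agent(
    agent_config=LlamacppAgentConfig(
        prompt_preamble="The AI is having a pleasant conversation about life",
        llamacpp_kwargs={"model_path": "path/to/model.ggmlv3.q4_0.bin"},  # placeholder
    ),
    logger=logging.getLogger(__name__),
)
```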
diff --git a/vocode/streaming/agent/llamacpp_agent.py b/vocode/streaming/agent/llamacpp_agent.py
new file mode 100644
index 000000000..10c88566f
--- /dev/null
+++ b/vocode/streaming/agent/llamacpp_agent.py
@@ -0,0 +1,148 @@
+from concurrent.futures import ThreadPoolExecutor
+import asyncio
+import logging
+from typing import AsyncGenerator, Optional, Tuple, Any, Union
+import typing
+from langchain import ConversationChain
+from vocode.streaming.agent.base_agent import RespondAgent
+from vocode.streaming.models.agent import LlamacppAgentConfig
+from vocode.streaming.agent.utils import collate_response_async
+from langchain.callbacks.base import BaseCallbackHandler
+from langchain.callbacks.manager import CallbackManager
+from langchain.llms import LlamaCpp
+from langchain.prompts import (
+    ChatPromptTemplate,
+    MessagesPlaceholder,
+    HumanMessagePromptTemplate,
+)
+from pydantic import BaseModel
+from langchain.schema import LLMResult, SystemMessage, get_buffer_string
+from langchain.memory import ConversationBufferMemory
+from langchain.prompts import (
+    ChatPromptTemplate,
+    MessagesPlaceholder,
+    HumanMessagePromptTemplate,
+    PromptTemplate,
+)
+from langchain.prompts.base import DEFAULT_FORMATTER_MAPPING
+
+ALPACA_TEMPLATE_WITH_HISTORY = """### Instruction:
+Your previous conversation history:
+{history}
+
+Current instruction/message to respond to: {input}
+### Response:"""
+
+
+class CallbackOutput(BaseModel):
+    finish: bool = False
+    response: Optional[LLMResult] = None
+    token: str = ""
+
+
+class FormatHistoryPromptTemplate(PromptTemplate):
+    def format(self, **kwargs: Any) -> str:
+        kwargs = self._merge_partial_and_user_variables(**kwargs)
+        kwargs["history"] = get_buffer_string(kwargs["history"])
+        return DEFAULT_FORMATTER_MAPPING[self.template_format](self.template, **kwargs)
+
+
+class CustomStreamingCallbackHandler(BaseCallbackHandler):
+    def __init__(self, output_queue: asyncio.Queue) -> None:
+        super().__init__()
+        self.output_queue = output_queue
+
+    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
+        """Run on new LLM token. Only available when streaming is enabled."""
+        self.output_queue.put_nowait(CallbackOutput(token=token))
+
+    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
+        """Run when LLM ends running."""
+        self.output_queue.put_nowait(CallbackOutput(finish=True, response=response))
+
+
+class LlamacppAgent(RespondAgent[LlamacppAgentConfig]):
+    def __init__(
+        self,
+        agent_config: LlamacppAgentConfig,
+        logger: Optional[logging.Logger] = None,
+    ):
+        super().__init__(agent_config=agent_config, logger=logger)
+
+        self.prompt: Union[PromptTemplate, ChatPromptTemplate]
+        if type(agent_config.prompt_template) is str:
+            if agent_config.prompt_template == "alpaca":
+                self.prompt = FormatHistoryPromptTemplate(
+                    input_variables=["history", "input"],
+                    template=ALPACA_TEMPLATE_WITH_HISTORY,
+                )
+            else:
+                raise ValueError(
+                    f"Unknown prompt template {agent_config.prompt_template}"
+                )
+        else:
+            if agent_config.prompt_template is None:
+                self.prompt = ChatPromptTemplate.from_messages(
+                    [
+                        MessagesPlaceholder(variable_name="history"),
+                        HumanMessagePromptTemplate.from_template("{input}"),
+                    ]
+                )
+            else:
+                self.prompt = typing.cast(PromptTemplate, agent_config.prompt_template)
+
+        self.callback_queue: asyncio.Queue = asyncio.Queue()
+        callback = CustomStreamingCallbackHandler(self.callback_queue)
+        callback_manager = CallbackManager([callback])
+        self.llm = LlamaCpp(
+            callback_manager=callback_manager, **agent_config.llamacpp_kwargs
+        )
+
+        self.memory = ConversationBufferMemory(return_messages=True)
+        self.memory.chat_memory.messages.append(
+            SystemMessage(content=self.agent_config.prompt_preamble)
+        )
+
+        self.conversation = ConversationChain(
+            memory=self.memory, prompt=self.prompt, llm=self.llm
+        )
+        self.thread_pool_executor = ThreadPoolExecutor(max_workers=1)
+
+    async def respond(
+        self,
+        human_input,
+        conversation_id: str,
+        is_interrupt: bool = False,
+    ) -> Tuple[str, bool]:
+        text = await asyncio.get_event_loop().run_in_executor(
+            self.thread_pool_executor,
+            lambda input: self.conversation.predict(input=input),
+            human_input,
+        )
+
+        self.logger.debug(f"LLM response: {text}")
+        return text, False
+
+    async def llamacpp_get_tokens(self):
+        while True:
+            callback_output = await self.callback_queue.get()
+            if callback_output.finish:
+                break
+            yield callback_output.token
+
+    async def generate_response(
+        self,
+        human_input: str,
+        conversation_id: str,
+        is_interrupt: bool = False,
+    ) -> AsyncGenerator[str, None]:
+        asyncio.get_event_loop().run_in_executor(
+            self.thread_pool_executor,
+            lambda input: self.conversation.predict(input=input),
+            human_input,
+        )
+
+        async for message in collate_response_async(
+            self.llamacpp_get_tokens(),
+        ):
+            yield str(message)
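To see the token flow end to end — `LlamaCpp` pushes tokens into the callback queue from the executor thread while `collate_response_async` assembles them into sentences — here is a minimal driver sketch. It is not part of the patch; the model path is a placeholder and a real GGML model file is required to actually run it:

```python
import asyncio

from vocode.streaming.agent.llamacpp_agent import LlamacppAgent
from vocode.streaming.models.agent import LlamacppAgentConfig


async def main() -> None:
    agent = LlamacppAgent(
        LlamacppAgentConfig(
            prompt_preamble="The AI is having a pleasant conversation about life",
            llamacpp_kwargs={"model_path": "path/to/nous-hermes-13b.ggmlv3.q4_0.bin"},  # placeholder
            prompt_template="alpaca",
        )
    )
    # generate_response kicks off conversation.predict in the thread pool and
    # yields collated sentences as tokens arrive on the callback queue.
    async for sentence in agent.generate_response(
        "Tell me about llamas.", conversation_id="example-conversation"
    ):
        print(sentence)


if __name__ == "__main__":
    asyncio.run(main())
```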
diff --git a/vocode/streaming/models/agent.py b/vocode/streaming/models/agent.py
index a4d6b8730..e6dfc5336 100644
--- a/vocode/streaming/models/agent.py
+++ b/vocode/streaming/models/agent.py
@@ -1,5 +1,6 @@
 from typing import List, Optional, Union
 from enum import Enum
+from langchain.prompts import PromptTemplate
 from pydantic import validator
 
 from vocode.streaming.models.actions import ActionConfig
@@ -30,6 +31,7 @@ class AgentType(str, Enum):
     CHAT_VERTEX_AI = "agent_chat_vertex_ai"
     ECHO = "agent_echo"
     GPT4ALL = "agent_gpt4all"
+    LLAMACPP = "agent_llamacpp"
     INFORMATION_RETRIEVAL = "agent_information_retrieval"
     RESTFUL_USER_IMPLEMENTED = "agent_restful_user_implemented"
     WEBSOCKET_USER_IMPLEMENTED = "agent_websocket_user_implemented"
@@ -108,6 +110,12 @@ class ChatVertexAIAgentConfig(AgentConfig, type=AgentType.CHAT_VERTEX_AI.value):
     generate_responses: bool = False  # Google Vertex AI doesn't support streaming
 
 
+class LlamacppAgentConfig(AgentConfig, type=AgentType.LLAMACPP.value):
+    prompt_preamble: str
+    llamacpp_kwargs: dict = {}
+    prompt_template: Optional[Union[PromptTemplate, str]] = None
+
+
 class InformationRetrievalAgentConfig(
     AgentConfig, type=AgentType.INFORMATION_RETRIEVAL.value
 ):
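Because `prompt_template` accepts either the `"alpaca"` shortcut or a langchain `PromptTemplate`, other instruction formats can be plugged in without touching the agent. A sketch of that path, reusing the patch's `FormatHistoryPromptTemplate` so the buffered history is rendered as plain text; the Vicuna-style template text and model path are illustrative, not from the patch:

```python
from vocode.streaming.agent.llamacpp_agent import (
    FormatHistoryPromptTemplate,
    LlamacppAgent,
)
from vocode.streaming.models.agent import LlamacppAgentConfig

# Illustrative Vicuna-style prompt; {history} and {input} are the variables
# the ConversationChain inside LlamacppAgent expects.
VICUNA_STYLE_TEMPLATE = """A chat between a curious user and a helpful AI assistant.

{history}

USER: {input}
ASSISTANT:"""

agent = LlamacppAgent(
    LlamacppAgentConfig(
        prompt_preamble="The AI is having a pleasant conversation about life",
        llamacpp_kwargs={"model_path": "path/to/model.ggmlv3.q4_0.bin"},  # placeholder
        prompt_template=FormatHistoryPromptTemplate(
            input_variables=["history", "input"],
            template=VICUNA_STYLE_TEMPLATE,
        ),
    )
)
```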