Add llama.cpp agent (#254)
* Add llama.cpp agent

* Add some docs

* Add LlamacppAgent to AgentFactory

* Remove comment

* Refactor duplicated sentence parsing logic

* Merge ActionAgent into Base and RespondAgents and make sentence splitting vendor agnostic

* Fix some mypy

* factor out OpenAI token logic and complete ActionAgent merge

* factor out function call logic

* docs updates and cleanup/revert prompt change

* pass mypy

* pr changes and test

* improve type hinting

* add function test

* add test case where the request is only func call

* Use new collate_response_async function

* Fix mypy

* merge tests

---------

Co-authored-by: zaptrem <[email protected]>
Co-authored-by: Ajay Raj <[email protected]>
3 people authored Aug 11, 2023
1 parent 8f9f02f commit 6c726e7
Showing 5 changed files with 203 additions and 1 deletion.
42 changes: 42 additions & 0 deletions docs/local-conversation.mdx
@@ -105,6 +105,48 @@ StreamingConversation(
)
```

## Llama.cpp

You can use any model supported by [llama.cpp](https://github.com/ggerganov/llama.cpp) with Vocode. This includes LLaMA, Alpaca, Vicuna, Koala, WizardLM, and more. We will use [NousResearch/Nous-Hermes-13b](https://huggingface.co/NousResearch/Nous-Hermes-13b) in this example because it currently ranks highly on HuggingFace's [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).

Our implementation is built on top of [langchain](https://python.langchain.com/docs/modules/model_io/models/llms/integrations/llamacpp), which integrates with llama.cpp through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python).

Install `llama-cpp-python` by running the following:

```
pip install llama-cpp-python
```

or run the following to install it with support for offloading model layers to a GPU via CUDA:

```
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
```

[llama-cpp-python](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal) has more installation commands for different BLAS backends.
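
Once installed, you can optionally sanity-check that the model loads through langchain's wrapper before wiring it into Vocode. This is a minimal sketch; the model path is a placeholder for wherever you downloaded the GGML weights:

```python
from langchain.llms import LlamaCpp

# Placeholder path: point this at your downloaded GGML model file.
llm = LlamaCpp(model_path="path/to/nous-hermes-13b.ggmlv3.q4_0.bin")
print(llm("### Instruction:\nSay hello in one sentence.\n### Response:"))
```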

Set up your agent in `StreamingConversation` as follows:

```python
from vocode.streaming.models.agent import LlamacppAgentConfig
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.agent.llamacpp_agent import LlamacppAgent

StreamingConversation(
    ...
    agent=LlamacppAgent(
        LlamacppAgentConfig(
            prompt_preamble="The AI is having a pleasant conversation about life",
            llamacpp_kwargs={
                "model_path": "path/to/nous-hermes-13b.ggmlv3.q4_0.bin",
                "verbose": True,
            },
            prompt_template="alpaca",
            initial_message=BaseMessage(text="Hello!"),
        )
    ),
    ...
)
```

You can add the key `n_gpu_layers` to the `llamacpp_kwargs` to offload some of the model's layers to a GPU.
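
For example (the layer count below is illustrative; how many layers fit depends on your model and available VRAM):

```python
LlamacppAgentConfig(
    prompt_preamble="The AI is having a pleasant conversation about life",
    llamacpp_kwargs={
        "model_path": "path/to/nous-hermes-13b.ggmlv3.q4_0.bin",
        "n_gpu_layers": 40,  # illustrative value; tune to your GPU's memory
    },
    prompt_template="alpaca",
)
```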

## Coqui TTS

Install the Coqui TTS package by running:
2 changes: 1 addition & 1 deletion tests/streaming/agent/test_utils.py
@@ -299,7 +299,7 @@ class StreamOpenAIResponseTestCase(BaseModel):


@pytest.mark.asyncio
-async def test_stream_openai_response_async():
+async def test_collate_response_async():
    test_cases = [
        StreamOpenAIResponseTestCase(
            openai_objects=[
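
The renamed `collate_response_async` helper is what the new llama.cpp agent below uses to turn a raw token stream into sentence-sized chunks. A rough usage sketch, assuming only the call shape shown in `llamacpp_agent.py` (an async generator of string tokens in, collated chunks out):

```python
import asyncio

from vocode.streaming.agent.utils import collate_response_async


async def fake_token_stream():
    # Tokens as an LLM might stream them back.
    for token in ["Hello", " there", "!", " How", " are", " you", "?"]:
        yield token


async def main():
    async for chunk in collate_response_async(fake_token_stream()):
        print(chunk)  # expect roughly sentence-sized pieces


asyncio.run(main())
```
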
4 changes: 4 additions & 0 deletions vocode/streaming/agent/factory.py
@@ -10,6 +10,7 @@
from vocode.streaming.agent.restful_user_implemented_agent import (
    RESTfulUserImplementedAgent,
)
from vocode.streaming.agent.llamacpp_agent import LlamacppAgent
from vocode.streaming.models.agent import (
    AgentConfig,
    AgentType,
@@ -19,6 +20,7 @@
    InformationRetrievalAgentConfig,
    LLMAgentConfig,
    RESTfulUserImplementedAgentConfig,
    LlamacppAgentConfig,
)


@@ -38,4 +40,6 @@ def create_agent(
            return RESTfulUserImplementedAgent(agent_config=agent_config, logger=logger)
        elif isinstance(agent_config, ChatAnthropicAgentConfig):
            return ChatAnthropicAgent(agent_config=agent_config, logger=logger)
        elif isinstance(agent_config, LlamacppAgentConfig):
            return LlamacppAgent(agent_config=agent_config, logger=logger)
        raise Exception("Invalid agent config", agent_config.type)
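
With the factory registration in place, a config-driven setup can construct the agent without importing it directly. A hypothetical sketch, assuming `AgentFactory` needs no constructor arguments and `create_agent` accepts the config keyword shown in the diff above:

```python
from vocode.streaming.agent.factory import AgentFactory
from vocode.streaming.models.agent import LlamacppAgentConfig

# Assumption: the default AgentFactory takes no constructor arguments.
factory = AgentFactory()
agent = factory.create_agent(
    agent_config=LlamacppAgentConfig(
        prompt_preamble="The AI is having a pleasant conversation about life",
        llamacpp_kwargs={"model_path": "path/to/nous-hermes-13b.ggmlv3.q4_0.bin"},
        prompt_template="alpaca",
    )
)
```
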
148 changes: 148 additions & 0 deletions vocode/streaming/agent/llamacpp_agent.py
@@ -0,0 +1,148 @@
from concurrent.futures import ThreadPoolExecutor
import asyncio
import logging
from typing import AsyncGenerator, Optional, Tuple, Any, Union
import typing
from langchain import ConversationChain
from vocode.streaming.agent.base_agent import RespondAgent
from vocode.streaming.models.agent import LlamacppAgentConfig
from vocode.streaming.agent.utils import collate_response_async
from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.llms import LlamaCpp
from pydantic import BaseModel
from langchain.schema import LLMResult, SystemMessage, get_buffer_string
from langchain.memory import ConversationBufferMemory
from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder,
    HumanMessagePromptTemplate,
    PromptTemplate,
)
from langchain.prompts.base import DEFAULT_FORMATTER_MAPPING

ALPACA_TEMPLATE_WITH_HISTORY = """### Instruction:
Your previous conversation history:
{history}
Current instruction/message to respond to: {input}
### Response:"""


class CallbackOutput(BaseModel):
    finish: bool = False
    response: Optional[LLMResult] = None
    token: str = ""


class FormatHistoryPromptTemplate(PromptTemplate):
    def format(self, **kwargs: Any) -> str:
        kwargs = self._merge_partial_and_user_variables(**kwargs)
        kwargs["history"] = get_buffer_string(kwargs["history"])
        return DEFAULT_FORMATTER_MAPPING[self.template_format](self.template, **kwargs)


class CustomStreamingCallbackHandler(BaseCallbackHandler):
    def __init__(self, output_queue: asyncio.Queue) -> None:
        super().__init__()
        self.output_queue = output_queue

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Run on new LLM token. Only available when streaming is enabled."""
        self.output_queue.put_nowait(CallbackOutput(token=token))

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        """Run when LLM ends running."""
        self.output_queue.put_nowait(CallbackOutput(finish=True, response=response))


class LlamacppAgent(RespondAgent[LlamacppAgentConfig]):
    def __init__(
        self,
        agent_config: LlamacppAgentConfig,
        logger: Optional[logging.Logger] = None,
    ):
        super().__init__(agent_config=agent_config, logger=logger)

        self.prompt: Union[PromptTemplate, ChatPromptTemplate]
        if type(agent_config.prompt_template) is str:
            if agent_config.prompt_template == "alpaca":
                self.prompt = FormatHistoryPromptTemplate(
                    input_variables=["history", "input"],
                    template=ALPACA_TEMPLATE_WITH_HISTORY,
                )
            else:
                raise ValueError(
                    f"Unknown prompt template {agent_config.prompt_template}"
                )
        else:
            if agent_config.prompt_template is None:
                self.prompt = ChatPromptTemplate.from_messages(
                    [
                        MessagesPlaceholder(variable_name="history"),
                        HumanMessagePromptTemplate.from_template("{input}"),
                    ]
                )
            else:
                self.prompt = typing.cast(PromptTemplate, agent_config.prompt_template)

        self.callback_queue: asyncio.Queue = asyncio.Queue()
        callback = CustomStreamingCallbackHandler(self.callback_queue)
        callback_manager = CallbackManager([callback])
        self.llm = LlamaCpp(
            callback_manager=callback_manager, **agent_config.llamacpp_kwargs
        )

        self.memory = ConversationBufferMemory(return_messages=True)
        self.memory.chat_memory.messages.append(
            SystemMessage(content=self.agent_config.prompt_preamble)
        )

        self.conversation = ConversationChain(
            memory=self.memory, prompt=self.prompt, llm=self.llm
        )
        self.thread_pool_executor = ThreadPoolExecutor(max_workers=1)

    async def respond(
        self,
        human_input,
        conversation_id: str,
        is_interrupt: bool = False,
    ) -> Tuple[str, bool]:
        text = await asyncio.get_event_loop().run_in_executor(
            self.thread_pool_executor,
            lambda input: self.conversation.predict(input=input),
            human_input,
        )

        self.logger.debug(f"LLM response: {text}")
        return text, False

    async def llamacpp_get_tokens(self):
        while True:
            callback_output = await self.callback_queue.get()
            if callback_output.finish:
                break
            yield callback_output.token

    async def generate_response(
        self,
        human_input: str,
        conversation_id: str,
        is_interrupt: bool = False,
    ) -> AsyncGenerator[str, None]:
        # Kick off the blocking prediction in the thread pool without awaiting it;
        # tokens stream back through the callback queue and are collated below.
        asyncio.get_event_loop().run_in_executor(
            self.thread_pool_executor,
            lambda input: self.conversation.predict(input=input),
            human_input,
        )

        async for message in collate_response_async(
            self.llamacpp_get_tokens(),
        ):
            yield str(message)
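
A small consumption sketch for the streaming path, assuming an already-constructed `LlamacppAgent` (the conversation id is an arbitrary placeholder):

```python
async def stream_reply(agent, user_text: str):
    # Sentences are yielded as soon as collate_response_async completes them,
    # rather than after the whole generation finishes.
    async for sentence in agent.generate_response(
        human_input=user_text,
        conversation_id="local-demo",  # placeholder id
    ):
        print(sentence)
```
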
8 changes: 8 additions & 0 deletions vocode/streaming/models/agent.py
@@ -1,5 +1,6 @@
from typing import List, Optional, Union
from enum import Enum
from langchain.prompts import PromptTemplate

from pydantic import validator
from vocode.streaming.models.actions import ActionConfig
@@ -30,6 +31,7 @@ class AgentType(str, Enum):
    CHAT_VERTEX_AI = "agent_chat_vertex_ai"
    ECHO = "agent_echo"
    GPT4ALL = "agent_gpt4all"
    LLAMACPP = "agent_llamacpp"
    INFORMATION_RETRIEVAL = "agent_information_retrieval"
    RESTFUL_USER_IMPLEMENTED = "agent_restful_user_implemented"
    WEBSOCKET_USER_IMPLEMENTED = "agent_websocket_user_implemented"
@@ -108,6 +110,12 @@ class ChatVertexAIAgentConfig(AgentConfig, type=AgentType.CHAT_VERTEX_AI.value):
    generate_responses: bool = False  # Google Vertex AI doesn't support streaming


class LlamacppAgentConfig(AgentConfig, type=AgentType.LLAMACPP.value):
    prompt_preamble: str
    llamacpp_kwargs: dict = {}
    prompt_template: Optional[Union[PromptTemplate, str]] = None


class InformationRetrievalAgentConfig(
    AgentConfig, type=AgentType.INFORMATION_RETRIEVAL.value
):
