From a168966608bab4765c8bdf79ce32de818d6309aa Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Tue, 21 Nov 2023 14:44:28 -0500 Subject: [PATCH] update docs --- docs/user_guide/llmcache_03.ipynb | 210 ++++++++++++++++++++---------- redisvl/llmcache/semantic.py | 2 +- 2 files changed, 142 insertions(+), 70 deletions(-) diff --git a/docs/user_guide/llmcache_03.ipynb b/docs/user_guide/llmcache_03.ipynb index 5d4d236b..37de425b 100644 --- a/docs/user_guide/llmcache_03.ipynb +++ b/docs/user_guide/llmcache_03.ipynb @@ -6,16 +6,16 @@ "source": [ "# Semantic Caching\n", "\n", - "RedisVL provides the ``LLMCache`` interface to turn Redis, with it's vector search capability, into a semantic cache to store query results, thereby reducing the number of requests and tokens sent to the Large Language Models (LLM) service. This decreases expenses and enhances performance by reducing the time taken to generate responses.\n", + "RedisVL provides a ``SemanticCache`` interface to turn Redis into a semantic cache to store responses to previously asked questions. This reduces the number of requests and tokens sent to the Large Language Models (LLM) service, decreasing costs and enhancing application throughput (by reducing the time taken to generate responses).\n", "\n", - "This notebook will go over how to use ``LLMCache`` for your applications" + "This notebook will go over how to use Redis as a Semantic Cache for your applications." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "First, we will import OpenAI to user their API for responding to prompts." + "First, we will import [OpenAI](https://platform.openai.com) to use their API for responding to user prompts. We will also create a simple `ask_openai` helper method to assist." ] }, { @@ -34,7 +34,7 @@ "\n", "openai.api_key = api_key\n", "\n", - "def ask_openai(question):\n", + "def ask_openai(question: str) -> str:\n", " response = openai.Completion.create(\n", " engine=\"text-davinci-003\",\n", " prompt=question,\n", @@ -57,7 +57,7 @@ } ], "source": [ - "# test it\n", + "# Test\n", "print(ask_openai(\"What is the capital of France?\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Initializing and using ``LLMCache``\n", + "## Initializing and using ``SemanticCache``\n", "\n", - "``LLMCache`` will automatically create an index within Redis upon initialization for the semantic cache. The same ``SearchIndex`` class used in the previous tutorials is used here to perform index creation and manipulation." + "``SemanticCache`` will automatically create an index within Redis upon initialization for the semantic cache content."
] }, { @@ -76,10 +76,13 @@ "metadata": {}, "outputs": [], "source": [ - "from redisvl.llmcache.semantic import SemanticCache\n", - "cache = SemanticCache(\n", - " redis_url=\"redis://localhost:6379\",\n", - " threshold=0.9, # semantic similarity threshold\n", + "from redisvl.llmcache import SemanticCache\n", + "\n", + "llmcache = SemanticCache(\n", + " name=\"llmcache\", # underlying search index name\n", + " prefix=\"llmcache:item\", # redis key prefix\n", + " redis_url=\"redis://localhost:6379\", # redis connection url string\n", + " distance_threshold=0.1 # semantic distance threshold\n", ")" ] }, @@ -95,11 +98,11 @@ "\n", "\n", "Index Information:\n", - "╭──────────────┬────────────────┬──────────────┬─────────────────┬────────────╮\n", - "│ Index Name │ Storage Type │ Prefixes │ Index Options │ Indexing │\n", - "├──────────────┼────────────────┼──────────────┼─────────────────┼────────────┤\n", - "│ cache │ HASH │ ['llmcache'] │ [] │ 0 │\n", - "╰──────────────┴────────────────┴──────────────┴─────────────────┴────────────╯\n", + "╭──────────────┬────────────────┬───────────────────┬─────────────────┬────────────╮\n", + "│ Index Name │ Storage Type │ Prefixes │ Index Options │ Indexing │\n", + "├──────────────┼────────────────┼───────────────────┼─────────────────┼────────────┤\n", + "│ llmcache │ HASH │ ['llmcache:item'] │ [] │ 0 │\n", + "╰──────────────┴────────────────┴───────────────────┴─────────────────┴────────────╯\n", "Index Fields:\n", "╭───────────────┬───────────────┬────────╮\n", "│ Name │ Attribute │ Type │\n", @@ -111,13 +114,22 @@ ], "source": [ "# look at the index specification created for the semantic cache lookup\n", - "!rvl index info -i cache" + "!rvl index info -i llmcache" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, + "outputs": [], + "source": [ + "question = \"What is the capital of France?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, "outputs": [ { "data": { @@ -125,118 +137,154 @@ "[]" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# check the cache\n", - "cache.check(\"What is the capital of France?\")" + "# Check the cache -- should be empty\n", + "llmcache.check(prompt=question)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "# store the question and answer\n", - "cache.store(\"What is the capital of France?\", \"Paris\")" + "# Cache the question and answer\n", + "llmcache.store(prompt=question, response=\"Paris\")" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Paris']" + "[{'response': 'Paris', 'vector_distance': '8.34465026855e-07'}]" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# check the cache again\n", - "cache.check(\"What is the capital of France?\")" + "# Check the cache again to see if new answer is there\n", + "llmcache.check(prompt=question)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "[{'response': 'Paris',\n", + " 'prompt': 'What is the capital of France?',\n", + " 'vector_distance': '8.34465026855e-07'}]" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# check for a semantically similar result\n", - 
"cache.check(\"What really is the capital of France?\")" + "# Update the return fields to gather other kinds of information about the cached entity\n", + "llmcache.check(prompt=question, return_fields=[\"response\", \"prompt\"])" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Paris']" + "[{'response': 'Paris', 'vector_distance': '0.0988066792488'}]" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# decrease the semantic similarity threshold\n", - "cache.set_threshold(0.7)\n", - "cache.check(\"What really is the capital of France?\")" + "# Check for a semantically similar result\n", + "llmcache.check(prompt=\"What actually is the capital of France?\")" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Widen the semantic distance threshold\n", + "llmcache.set_threshold(0.3)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "[{'response': 'Paris', 'vector_distance': '0.273138523102'}]" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# adversarial example (not semantically similar enough)\n", - "cache.check(\"What is the capital of Spain?\")" + "# Really try to trick it by asking around the point\n", + "# But is able to slip just under our new threshold\n", + "llmcache.check(\n", + " prompt=\"What is the capital city of the country in Europe that also has a city named Nice?\"\n", + ")" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "cache.clear()" + "# Invalidate the cache completely by clearing it out\n", + "llmcache.clear()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# should be empty now\n", + "llmcache.check(prompt=question)" ] }, { @@ -245,57 +293,78 @@ "source": [ "## Performance\n", "\n", - "Next, we will measure the speedup obtained by using ``LLMCache``. We will use the ``time`` module to measure the time taken to generate responses with and without ``LLMCache``." + "Next, we will measure the speedup obtained by using ``SemanticCache``. We will use the ``time`` module to measure the time taken to generate responses with and without ``SemanticCache``." 
] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "def answer_question(question: str):\n", - " results = cache.check(question)\n", + "import time\n", + "\n", + "\n", + "def answer_question(question: str) -> str:\n", + " \"\"\"Helper function to answer a simple question using OpenAI with a wrapper\n", + " check for the answer in the semantic cache first.\n", + "\n", + " Args:\n", + " question (str): User input question.\n", + "\n", + " Returns:\n", + " str: Response.\n", + " \"\"\"\n", + " results = llmcache.check(prompt=question)\n", " if results:\n", - " return results[0]\n", + " return results[0][\"response\"]\n", " else:\n", " answer = ask_openai(question)\n", - " cache.store(question, answer)\n", " return answer" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Time taken without cache 0.574105978012085\n" + "Without caching, a call to openAI to answer this simple question took 0.32796525955200195 seconds.\n" ] } ], "source": [ - "import time\n", "start = time.time()\n", + "# asking a question -- openai response time\n", "answer = answer_question(\"What is the capital of France?\")\n", "end = time.time()\n", - "print(f\"Time taken without cache {time.time() - start}\")" + "\n", + "print(f\"Without caching, a call to openAI to answer this simple question took {end-start} seconds.\")" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "llmcache.store(prompt=\"What is the capital of France?\", response=\"Paris\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Time Taken with cache: 0.09868717193603516\n", - "Percentage of time saved: 82.81%\n" + "Time Taken with cache enabled: 0.07565498352050781\n", + "Percentage of time saved: 76.93%\n" ] } ], @@ -303,13 +372,13 @@ "cached_start = time.time()\n", "cached_answer = answer_question(\"What is the capital of France?\")\n", "cached_end = time.time()\n", - "print(f\"Time Taken with cache: {cached_end - cached_start}\")\n", + "print(f\"Time Taken with cache enabled: {cached_end - cached_start}\")\n", "print(f\"Percentage of time saved: {round(((end - start) - (cached_end - cached_start)) / (end - start) * 100, 2)}%\")" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -327,9 +396,9 @@ "│ num_records │ 2 │\n", "│ percent_indexed │ 1 │\n", "│ hash_indexing_failures │ 0 │\n", - "│ number_of_uses │ 11 │\n", + "│ number_of_uses │ 12 │\n", "│ bytes_per_record_avg │ 0 │\n", - "│ doc_table_size_mb │ 0.000134468 │\n", + "│ doc_table_size_mb │ 0.000139236 │\n", "│ inverted_sz_mb │ 0 │\n", "│ key_table_size_mb │ 2.76566e-05 │\n", "│ offset_bits_per_record_avg │ nan │\n", @@ -337,8 +406,8 @@ "│ offsets_per_term_avg │ 0 │\n", "│ records_per_doc_avg │ 2 │\n", "│ sortable_values_size_mb │ 0 │\n", - "│ total_indexing_time │ 0.087 │\n", - "│ total_inverted_index_blocks │ 11 │\n", + "│ total_indexing_time │ 0.514 │\n", + "│ total_inverted_index_blocks │ 0 │\n", "│ vector_index_sz_mb │ 3.0161 │\n", "╰─────────────────────────────┴─────────────╯\n" ] @@ -346,17 +415,20 @@ ], "source": [ "# check the stats of the index\n", - "!rvl stats -i cache" + "!rvl stats -i llmcache" ] }, { "cell_type": "code", - "execution_count": 16, + 
"execution_count": 20, "metadata": {}, "outputs": [], "source": [ - "# remove the index and all cached items\n", - "cache.index.delete()" + "# Clear the cache\n", + "llmcache.clear()\n", + "\n", + "# Remove the underlying index\n", + "llmcache._index.delete(drop=True)" ] } ], diff --git a/redisvl/llmcache/semantic.py b/redisvl/llmcache/semantic.py index 2c782175..300ebdd8 100644 --- a/redisvl/llmcache/semantic.py +++ b/redisvl/llmcache/semantic.py @@ -138,7 +138,7 @@ def set_threshold(self, distance_threshold: float): self._distance_threshold = float(distance_threshold) def clear(self) -> None: - """Clear the LLMCache of all keys in the index.""" + """Clear the cache of all keys while preserving the index""" with self._index.client.pipeline(transaction=False) as pipe: for key in self._index.client.scan_iter(match=f"{self._index.prefix}:*"): pipe.delete(key)