Updated docs for the BM25 preprocessing function

langchain-ai · Nov 14, 2024 · 47ea9dd · 47ea9dd
1 parent 4b641f8
commit 47ea9dd
Showing 1 changed file with 144 additions and 34 deletions.
diff --git a/docs/docs/integrations/retrievers/bm25.ipynb b/docs/docs/integrations/retrievers/bm25.ipynb
@@ -14,25 +14,42 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "a801b57c",
-   "metadata": {},
-   "outputs": [],
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:34:56.467044Z",
+     "start_time": "2024-11-13T23:34:54.395294Z"
+    }
+   },
    "source": [
     "%pip install --upgrade --quiet  rank_bm25"
-   ]
+   ],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "execution_count": 1
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "393ac030",
    "metadata": {
-    "tags": []
+    "tags": [],
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:51.348359Z",
+     "start_time": "2024-11-13T23:35:49.409254Z"
+    }
    },
-   "outputs": [],
    "source": [
     "from langchain_community.retrievers import BM25Retriever"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": 3
   },
   {
    "cell_type": "markdown",
@@ -44,15 +61,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "id": "98b1c017",
    "metadata": {
-    "tags": []
+    "tags": [],
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:53.096938Z",
+     "start_time": "2024-11-13T23:35:52.493243Z"
+    }
    },
-   "outputs": [],
    "source": [
     "retriever = BM25Retriever.from_texts([\"foo\", \"bar\", \"world\", \"hello\", \"foo bar\"])"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": 4
   },
   {
    "cell_type": "markdown",
@@ -66,10 +87,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "id": "53af4f00",
-   "metadata": {},
-   "outputs": [],
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:54.202737Z",
+     "start_time": "2024-11-13T23:35:54.198431Z"
+    }
+   },
    "source": [
     "from langchain_core.documents import Document\n",
     "\n",
@@ -82,7 +106,9 @@
     "        Document(page_content=\"foo bar\"),\n",
     "    ]\n",
     ")"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": 5
   },
   {
    "cell_type": "markdown",
@@ -96,49 +122,133 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "id": "c0455218",
    "metadata": {
-    "tags": []
+    "tags": [],
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:55.643026Z",
+     "start_time": "2024-11-13T23:35:55.595272Z"
+    }
    },
-   "outputs": [],
    "source": [
     "result = retriever.invoke(\"foo\")"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": 6
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "id": "7dfa5c29",
    "metadata": {
-    "tags": []
+    "tags": [],
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:56.122327Z",
+     "start_time": "2024-11-13T23:35:56.112647Z"
+    }
    },
+   "source": [
+    "result"
+   ],
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[Document(page_content='foo', metadata={}),\n",
-       " Document(page_content='foo bar', metadata={}),\n",
-       " Document(page_content='hello', metadata={}),\n",
-       " Document(page_content='world', metadata={})]"
+       "[Document(metadata={}, page_content='foo'),\n",
+       " Document(metadata={}, page_content='foo bar'),\n",
+       " Document(metadata={}, page_content='hello'),\n",
+       " Document(metadata={}, page_content='world')]"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
+   "execution_count": 7
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
    "source": [
-    "result"
-   ]
+    "## Preprocessing Function\n",
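+    "Pass a custom preprocessing function to the retriever through the `preprocess_func` argument to improve search results. By default, `BM25Retriever` tokenizes text by splitting on whitespace; a word-level tokenizer such as NLTK's `word_tokenize` also separates punctuation from words, which can enhance retrieval. This is especially useful when the documents are chunks of the kind typically stored in vector stores like Chroma, Pinecone, or FAISS."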
+   ],
+   "id": "51043723814c0d68"
   },
   {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-14T00:05:19.526952Z",
+     "start_time": "2024-11-14T00:05:19.521538Z"
+    }
+   },
    "cell_type": "code",
-   "execution_count": null,
-   "id": "997aaa8d",
-   "metadata": {},
-   "outputs": [],
-   "source": []
+   "source": [
+    "import nltk\n",
+    "nltk.download('punkt_tab')"
+   ],
+   "id": "c8b25a524d11f7ab",
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt_tab to\n",
+      "[nltk_data]     C:\\Users\\Kiril\\AppData\\Roaming\\nltk_data...\n",
+      "[nltk_data]   Package punkt_tab is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 29
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-14T00:40:58.728953Z",
+     "start_time": "2024-11-14T00:40:58.722140Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from nltk.tokenize import word_tokenize\n",
+    "\n",
+    "retriever = BM25Retriever.from_documents(\n",
+    "    [\n",
+    "        Document(page_content=\"foo\"),\n",
+    "        Document(page_content=\"bar\"),\n",
+    "        Document(page_content=\"world\"),\n",
+    "        Document(page_content=\"hello\"),\n",
+    "        Document(page_content=\"foo bar\"),\n",
+    "    ], k=2, preprocess_func=word_tokenize)\n",
+    "\n",
+    "result = retriever.invoke(\"bar\")\n",
+    "result"
+   ],
+   "id": "566fcc801cda5da4",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(metadata={}, page_content='bar'),\n",
+       " Document(metadata={}, page_content='foo bar')]"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 32
   }
  ],
  "metadata": {