update docs
tylertitsworth committed Dec 16, 2023
1 parent 4728b8d commit 88b9307
Showing 10 changed files with 271 additions and 40 deletions.
26 changes: 26 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,26 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
{
  "name": "Existing Dockerfile",
  "build": {
    // Sets the run context to one level up instead of the .devcontainer folder.
    "context": "..",
    // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
    "dockerfile": "../Dockerfile"
  },

  // Features to add to the dev container. More info: https://containers.dev/features.
  // "features": {},

  // Use 'forwardPorts' to make a list of ports inside the container available locally.
  "forwardPorts": [8000],

  // Uncomment the next line to run commands after the container is created.
  // "postCreateCommand": "cat /etc/os-release",

  // Configure tool-specific properties.
  // "customizations": {},

  // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
  // "remoteUser": "devcontainer"
}
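
To develop inside this container, VS Code's "Dev Containers: Reopen in Container" command works out of the box; with the [Dev Containers CLI](https://github.com/devcontainers/cli) installed, a rough equivalent is:

```bash
# Build and start the dev container from the repository root.
devcontainer up --workspace-folder .
```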
13 changes: 13 additions & 0 deletions .dockerignore
@@ -0,0 +1,13 @@
**__pycache__**
_config.yml
.env
.devcontainer
.git
.github
.gitignore
.vscode
data
Dockerfile
memory
model
sources
1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -0,0 +1 @@
* @tylertitsworth
10 changes: 10 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,10 @@
version: 2
updates:
  - package-ecosystem: "pip" # See documentation for possible values
    directory: "." # Location of package manifests
    schedule:
      interval: "weekly"
  - package-ecosystem: "github-actions" # See documentation for possible values
    directory: ".github/workflows" # Location of package manifests
    schedule:
      interval: "weekly"
1 change: 1 addition & 0 deletions .gitignore
@@ -4,5 +4,6 @@
 **__pycache__**
 chainlit.md
 data/
+memory/
 model/
 sources/
12 changes: 12 additions & 0 deletions Dockerfile
@@ -0,0 +1,12 @@
FROM python:3.11-slim-bookworm

COPY . /app
WORKDIR /app

RUN apt-get update -y && apt-get install git -y

RUN pip install --no-cache-dir -r requirements.txt

EXPOSE 8000

CMD ["chainlit", "run", "main.py", "-w"]
135 changes: 130 additions & 5 deletions README.md
@@ -1,6 +1,131 @@
-# Mediawiki Chatbot
+# Multi Mediawiki RAG Chatbot
 
-1. `pip install requirements.txt`
-2. Add wiki `xml` data with `/wiki/Special:Statistics` extension
-3. Create vectorstore and test chain with `python3 main.py`
-4. Run chatbot application to localhost with `chainlit run main.py -w`
[Chatbots](https://www.forbes.com/advisor/business/software/what-is-a-chatbot/) are very popular right now, and much of the world's openly accessible information lives in some kind of [Mediawiki](https://en.wikipedia.org/wiki/MediaWiki). Building a [RAG](https://research.ibm.com/blog/retrieval-augmented-generation-RAG) chatbot on that data is becoming a powerful alternative to traditional data gathering. This project provides a basic template for building your own chatbot that runs locally on Linux.

## Table of Contents

- [Multi Mediawiki RAG Chatbot](#multi-mediawiki-rag-chatbot)
- [Table of Contents](#table-of-contents)
- [About](#about)
- [Architecture](#architecture)
- [Filesystem](#filesystem)
- [System Prompt](#system-prompt)
- [Quickstart](#quickstart)
- [Prerequisites](#prerequisites)
- [Create Vector Database](#create-vector-database)
- [Expected Output](#expected-output)
- [Start Chatbot](#start-chatbot)

## About

[Mediawikis](https://en.wikipedia.org/wiki/MediaWiki) hosted by [Fandom](https://www.fandom.com/) usually allow you to download an XML dump of the entire wiki as it currently exists. This project primarily leverages [Langchain](https://github.com/langchain-ai/langchain), along with a few other open source projects, to combine many readily available quickstart guides into a complete vertical application built on mediawiki data.

### Architecture

```mermaid
graph TD;
Huggingface --Sentence-Transformer --> db
Ollama --llama2--> Model --> Langchain
Huggingface --any-llm--> Model
cache[(cache)] <--sqlite3--> Langchain
xml-dump-a --MWDumpLoader--> Text-Splitter
xml-dump-b --MWDumpLoader--> Text-Splitter
xml-dump-c --MWDumpLoader--> Text-Splitter
Text-Splitter --> db
db[(Chroma)] --Retriever--> Langchain
Memory <--Chat-History--> Langchain
Prompt --DocumentQA--> Langchain
Langchain <-.-> id(((Chainlit)))
click db href "https://github.com/chroma-core/chroma"
click Huggingface href "https://huggingface.co/"
click id href "https://github.com/Chainlit/chainlit"
click Langchain href "https://github.com/langchain-ai/langchain"
click Ollama href "https://github.com/jmorganca/ollama"
click sqlite3 href "https://www.sqlite.org/index.html"
```
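
The `Memory <--Chat-History--> Langchain` edge corresponds to Langchain's conversation memory, whose setup is elided from the `main.py` diff below. A minimal sketch of that wiring (assuming `langchain==0.0.350` and the names imported in `main.py`):

```python
from langchain.memory import ChatMessageHistory, ConversationBufferMemory

# Chat history is kept in memory and handed back to the retrieval chain
# under the "chat_history" key on every turn.
message_history = ChatMessageHistory()
memory = ConversationBufferMemory(
    memory_key="chat_history",
    output_key="answer",
    chat_memory=message_history,
    return_messages=True,
)
```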

### Filesystem

```text
multi-mediawiki-rag
├── .chainlit
│   ├── .langchain.db
│   └── config.toml
├── .env
├── Dockerfile
├── chainlit.md
├── config.yaml
├── data
│   ├── *
│   └── chroma.sqlite3
├── main.py
├── memory
│   └── cache.db
├── model
│   └── sentence-transformers_all-MiniLM-L6-v2
│       └── *
├── requirements.txt
└── sources
    └── <wikiname>_pages_current.xml
```

### System Prompt

```text
Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
---
Content: {context}
---
```
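
In `main.py` this text becomes the system message of a chat prompt (the assembly lines are elided from the diff below). A minimal sketch of that step, assuming `langchain==0.0.350`:

```python
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# {context} is filled with retrieved wiki chunks; {question} with the user query.
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_prompt),
    HumanMessagePromptTemplate.from_template("{question}"),
])
```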

## Quickstart

These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.

### Prerequisites

These steps assume you are using a modern Linux OS like Ubuntu with Python 3.

1. Download a mediawiki's XML dump by browsing to that wiki's `/wiki/Special:Statistics` page.
2. Edit [`config.yaml`](config.yaml) with the location of your XML mediawiki data, the wiki names, and an example prompt to test against the wiki.
   1. You can download your LLM at runtime from [Huggingface](https://huggingface.co/), or pull it locally beforehand with [Ollama](https://github.com/jmorganca/ollama) (`ollama pull llama2`).
3. Install the Python requirements:

```bash
pip install -r requirements.txt
```

### Create Vector Database

Your XML data needs to be loaded and transformed into embeddings to create a [Chroma](https://python.langchain.com/docs/integrations/vectorstores/chroma) VectorDB.

```bash
python main.py
```

> **Note:** Use an existing vector database by adding `--no-embed`.
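
For example, to skip embedding and reuse the database already persisted under `data/`:

```bash
python main.py --no-embed
```
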
#### Expected Output

- Prompt: "What is a Tako?"

```text
2023-12-15 22:09:21 - Loaded .env file
2023-12-15 22:09:24 - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2023-12-15 22:09:25 - Use pytorch device: cpu
2023-12-15 22:13:49 - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry
for more information.
Batches: 100%|███████████████████████████████████████████████| 1303/1303 [10:28<00:00, 2.07it/s]
...
```
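
Under the hood, `main.py` loads each XML dump with `MWDumpLoader`, splits the pages into chunks, embeds them, and persists the result to Chroma. A condensed sketch of that flow for a single wiki (splitter parameters here are illustrative, not the repository's exact values; assumes `langchain==0.0.350`):

```python
from langchain.document_loaders import MWDumpLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Load one wiki dump and split its pages into chunks.
loader = MWDumpLoader(file_path="sources/dungeons_pages_current.xml",
                      encoding="utf-8", skip_redirects=True)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)  # illustrative sizes
docs = splitter.split_documents(loader.load())

# Embed the chunks and persist them to the local Chroma store in ./data.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2", cache_folder="./model"
)
vectordb = Chroma.from_documents(docs, embeddings, persist_directory="data")
vectordb.persist()
```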

### Start Chatbot

```bash
chainlit run main.py -w
```

Chainlit's `-w` flag watches your source files and reloads the app when they change. Access the chatbot GUI at `http://localhost:8000`.
8 changes: 8 additions & 0 deletions config.yaml
@@ -0,0 +1,8 @@
mediawikis:
- dungeons
- eberron
- forgottenrealms
- planescape
prompt: "What is a Tako?"
model: null # "Intel/neural-chat-7b-v3-3"
source: ./sources
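
For reference, `main.py`'s `MultiWiki` class loads this file with `yaml.safe_load`; a small sketch of how the keys above become attributes (mirroring the constructor in the diff below):

```python
import yaml

with open("config.yaml", "r", encoding="utf-8") as file:
    data = yaml.safe_load(file)

# 'mediawikis' becomes a dict of wiki names ({"dungeons": "", ...});
# every other key becomes an attribute, e.g. source == "./sources",
# prompt == "What is a Tako?", and model is None (so Ollama is used).
wikis = {wiki: "" for wiki in data["mediawikis"]}
```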
88 changes: 62 additions & 26 deletions main.py
@@ -1,34 +1,54 @@
+from langchain.cache import SQLiteCache
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.chains import ConversationalRetrievalChain
 from langchain.document_loaders import MWDumpLoader
 from langchain.document_loaders.merge import MergedDataLoader
 from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.globals import set_llm_cache
+from langchain.llms import Ollama
 from langchain.llms.huggingface_pipeline import HuggingFacePipeline
 from langchain.memory import ChatMessageHistory, ConversationBufferMemory
 from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
+from sys import exit
 
+import argparse
 import chainlit as cl
+import yaml
 
-# from dotenv import load_dotenv
-# load_dotenv()
+class MultiWiki:
+    def __init__(self):
+        try:
+            with open('config.yaml', 'r', encoding='utf-8') as file:
+                data = yaml.safe_load(file)
+        except FileNotFoundError:
+            print("Error: File config.yaml not found.")
+        except yaml.YAMLError as e:
+            print(f"Error reading YAML file: {e}")
+
+        for key, val in data.items():
+            if key == 'mediawikis':
+                self.wikis = {wiki: "" for wiki in data['mediawikis']}
+            else:
+                setattr(self, key, val)
+        parser = argparse.ArgumentParser()
+        parser.add_argument('--no-embed', dest='embed', action='store_false')
+        self.args = parser.parse_args()
+
+def create_vector_db(source, wikis):
+    if not source:
+        print("No data sources found")
+        exit(1)
 
-def create_vector_db():
     # https://python.langchain.com/docs/integrations/text_embedding/huggingfacehub
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", cache_folder="./model")
 
-    wikis = {
-        "dungeons": "",
-        "eberron": "",
-        "forgottenrealms": "",
-        "planescape": "",
-    }
     for wiki in wikis.keys():
         # https://python.langchain.com/docs/integrations/document_loaders/mediawikidump
         wikis[wiki] = MWDumpLoader(
-            file_path=f"sources/{wiki}_pages_current.xml",
+            file_path=f"{source}/{wiki}_pages_current.xml",
             encoding="utf-8",
             skip_redirects=True,
             stop_on_error=False
@@ -47,12 +67,12 @@ def create_vector_db():
     )
     vectordb.persist()
 
-def create_chain():
+def create_chain(model):
     system_prompt="""
     Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
     If you don't know the answer, just say that you don't know. Don't try to make up an answer.
     ALWAYS return a "SOURCES" part in your answer.
-    ----
+    ---
     Content: {context}
     ---
     """
@@ -75,18 +95,28 @@ def create_chain():
     )
     # https://python.langchain.com/docs/integrations/text_embedding/huggingfacehub
     embeddings = HuggingFaceEmbeddings(
-        model_name="sentence-transformers/all-MiniLM-L6-v2", cache_folder="./model"
+        model_name="sentence-transformers/all-MiniLM-L6-v2",
+        cache_folder="./model"
     )
     vectordb = Chroma(persist_directory="data", embedding_function=embeddings)
     callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
-    # https://python.langchain.com/docs/integrations/llms/huggingface_pipelines
-    model = HuggingFacePipeline.from_model_id(
-        model="Intel/neural-chat-7b-v3-1",
-        task="text-generation",
-        pipeline_kwargs={"max_new_tokens": 10},
-        verbose=False,
-        callback_manager=callback_manager,
-    )
+    # https://python.langchain.com/docs/integrations/llms/llm_caching
+    set_llm_cache(SQLiteCache(database_path="memory/cache.db"))
+    if model:
+        # https://python.langchain.com/docs/integrations/llms/huggingface_pipelines
+        model = HuggingFacePipeline.from_model_id(
+            model_id=model,
+            cache=True,
+            callback_manager=callback_manager,
+            pipeline_kwargs={"max_new_tokens": 10},
+            task="text-generation",
+        )
+    else:
+        # https://python.langchain.com/docs/integrations/llms/ollama
+        model = Ollama(
+            model="llama2",
+            callback_manager=callback_manager,
+        )
     # https://api.python.langchain.com/en/latest/chains/langchain.chains.conversational_retrieval.base.ConversationalRetrievalChain.html
     chain = ConversationalRetrievalChain.from_llm(
         llm=model,
@@ -96,13 +126,15 @@ def create_chain():
         combine_docs_chain_kwargs={"prompt": prompt},
         return_source_documents=True
     )
+
     return chain
 
 # https://docs.chainlit.io/integrations/langchain
 # https://docs.chainlit.io/examples/qa
 @cl.on_chat_start
 async def on_chat_start():
-    chain = create_chain()
+    wiki = MultiWiki()
+    chain = create_chain(wiki.model)
     cl.user_session.set("chain", chain)
 
 
@@ -136,10 +168,14 @@ async def on_message(message: cl.Message):
 
     await cl.Message(content=answer, elements=text_elements).send()
 
-
 if __name__ == "__main__":
-    create_vector_db()
-    chain = create_chain()
-    res = chain("List every octopus monster in the forgotten realms")
+    wiki = MultiWiki()
+    if wiki.args.embed:
+        create_vector_db(wiki.source, wiki.wikis)
+    chain = create_chain(wiki.model)
+    if not wiki.prompt:
+        print("No Prompt for Chatbot found")
+        exit(1)
+    res = chain(wiki.prompt)
     answer = res["answer"]
     print([source_doc.page_content for source_doc in res["source_documents"]])
17 changes: 8 additions & 9 deletions requirements.txt
@@ -1,12 +1,11 @@
-# openai
-chainlit
-chromadb
+chainlit==0.7.700
+chromadb==0.4.20
 # https://github.com/mediawiki-utilities/python-mwxml/pull/19
 git+https://github.com/gdedrouas/python-mwxml@xml_format_0.11
 git+https://github.com/mediawiki-utilities/python-mwtypes@updates_schema_0.11
-jq
-langchain
-mwparserfromhell
-sentence-transformers
-tiktoken
-torch[cpu]
+httptools==0.6.1
+langchain==0.0.350
+mwparserfromhell==0.6.5
+sentence-transformers==2.2.2
+tiktoken==0.5.2
+uvloop==0.19.0
