Commit

first commit
hfhoffman1144 committed Jan 22, 2024
1 parent 2bf9eb3 commit 68ddf4a
Showing 32 changed files with 22,320 additions and 0 deletions.
3 changes: 3 additions & 0 deletions langchain-rag-app/.gitignore
@@ -0,0 +1,3 @@
chroma_data/
*.env
notebooks/
40 changes: 40 additions & 0 deletions langchain-rag-app/README.md
@@ -0,0 +1,40 @@
# Build an LLM RAG Chatbot With LangChain

This repo contains the source code for [Build an LLM RAG Chatbot With LangChain](https://realpython.com/build-llm-rag-chatbot-with-langchain/#demo-a-llm-rag-chatbot-with-langchain-and-neo4j)

Create a `.env` file in the root directory and add the following environment variables:

```.env
NEO4J_URI=<YOUR_NEO4J_URI>
NEO4J_USERNAME=<YOUR_NEO4J_USERNAME>
NEO4J_PASSWORD=<YOUR_NEO4J_PASSWORD>
OPENAI_API_KEY=<YOUR_OPENAI_API_KEY>
HOSPITALS_CSV_PATH=https://raw.githubusercontent.com/hfhoffman1144/langchain_neo4j_rag_app/main/data/hospitals.csv
PAYERS_CSV_PATH=https://raw.githubusercontent.com/hfhoffman1144/langchain_neo4j_rag_app/main/data/payers.csv
PHYSICIANS_CSV_PATH=https://raw.githubusercontent.com/hfhoffman1144/langchain_neo4j_rag_app/main/data/physicians.csv
PATIENTS_CSV_PATH=https://raw.githubusercontent.com/hfhoffman1144/langchain_neo4j_rag_app/main/data/patients.csv
VISITS_CSV_PATH=https://raw.githubusercontent.com/hfhoffman1144/langchain_neo4j_rag_app/main/data/visits.csv
REVIEWS_CSV_PATH=https://raw.githubusercontent.com/hfhoffman1144/langchain_neo4j_rag_app/main/data/reviews.csv
CHATBOT_URL=http://host.docker.internal:8000/hospital-rag-agent
HOSPITAL_AGENT_MODEL=gpt-3.5-turbo-1106
HOSPITAL_CYPHER_MODEL=gpt-3.5-turbo-1106
HOSPITAL_QA_MODEL=gpt-3.5-turbo
```

The three `NEO4J_` variables are used to connect to your Neo4j AuraDB instance. Follow the directions [here](https://neo4j.com/cloud/platform/aura-graph-database/?ref=docs-nav-get-started) to create a free instance.

The chatbot uses OpenAI LLMs, so you'll need to create an [OpenAI API key](https://realpython.com/generate-images-with-dalle-openai-api/#get-your-openai-api-key) and store it as `OPENAI_API_KEY`.

Once you have a running Neo4j AuraDB instance and have filled in all of the environment variables in `.env`, you can run the entire project with [Docker Compose](https://docs.docker.com/compose/). If you don't have it yet, install Docker Compose by following [these directions](https://docs.docker.com/compose/install/).

With the environment variables set, the AuraDB instance running, and Docker Compose installed, open a terminal and run:

```console
$ docker-compose up --build
```

After each container finishes building, you'll be able to access the chatbot API documentation at `http://localhost:8000/docs` and the Streamlit app at `http://localhost:8501/`.
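Docker Compose reads the `.env` file for you, but if you want to load the same variables in a local Python session without extra dependencies, a minimal stdlib sketch of `.env` parsing looks like this (the key and value below are placeholders, not the project's real settings):

```python
import os
import tempfile

def load_env_file(path: str) -> None:
    """Minimal .env loader: KEY=VALUE lines; blanks and '#' comments skipped."""
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            os.environ.setdefault(key.strip(), value.strip())

# A throwaway file stands in for the project's real .env
with tempfile.NamedTemporaryFile("w", suffix=".env", delete=False) as f:
    f.write("EXAMPLE_HOSPITAL_MODEL=gpt-3.5-turbo\n# a comment\n")
    env_path = f.name

load_env_file(env_path)
print(os.getenv("EXAMPLE_HOSPITAL_MODEL"))  # gpt-3.5-turbo
```

In practice you'd more likely reach for the `python-dotenv` package, but the mapping from file lines to environment variables is the same.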
10 changes: 10 additions & 0 deletions langchain-rag-app/chatbot_api/Dockerfile
@@ -0,0 +1,10 @@
FROM python:3.11-slim

WORKDIR /app
COPY src/ /app

COPY ./pyproject.toml /code/pyproject.toml
RUN pip install /code/.

EXPOSE 8000
CMD ["sh", "entrypoint.sh"]
20 changes: 20 additions & 0 deletions langchain-rag-app/chatbot_api/pyproject.toml
@@ -0,0 +1,20 @@
[project]
name = "chatbot_api"
version = "0.1"
dependencies = [
"asyncio==3.4.3",
"chromadb==0.4.17",
"fastapi==0.109.0",
"langchain==0.1.0",
"langchain-openai==0.0.2",
"langchainhub==0.1.14",
"neo4j==5.14.1",
"numpy==1.26.2",
"openai==1.7.2",
"opentelemetry-api==1.22.0",
"pydantic==2.5.1",
"tiktoken==0.5.2"
]

[project.optional-dependencies]
dev = ["black", "flake8"]
82 changes: 82 additions & 0 deletions langchain-rag-app/chatbot_api/src/agents/hospital_rag_agent.py
@@ -0,0 +1,82 @@
import os
from langchain_openai import ChatOpenAI
from langchain.agents import create_openai_functions_agent, Tool, AgentExecutor
from langchain import hub
from chains.hospital_review_chain import reviews_vector_chain
from chains.hospital_cypher_chain import hospital_cypher_chain
from tools.wait_times import (
get_current_wait_times,
find_most_available_hospital
)

HOSPITAL_AGENT_MODEL = os.getenv("HOSPITAL_AGENT_MODEL")

hospital_agent_prompt = hub.pull("hwchase17/openai-functions-agent")

tools = [
Tool(
name="Experiences",
func=reviews_vector_chain.invoke,
description="""Useful when you need to answer questions
about patient experiences, feelings, or any other qualitative
question that could be answered about a patient. Not useful
for answering objective questions that involve counting,
percentages, or any other aggregation. Use the entire prompt
as input to the tool. For instance, if the prompt is
"Are patients satisfied with their care?", the input should be
"Are patients satisfied with their care?".
""",
),
Tool(
name="Graph",
func=hospital_cypher_chain.invoke,
description="""Useful for answering questions about patients,
physicians, hospitals, insurance payers, patient review
statistics, and hospital visit details. Use the entire prompt as
input to the tool. For instance, if the prompt is "How many visits
have there been?", the input should be "How many visits have
there been?".
""",
),
Tool(
name="Waits",
func=get_current_wait_times,
description="""Use when asked about current wait times
at a specific hospital. This tool can only get the current
wait time at a hospital and does not have any information about
aggregate or historical wait times. This tool returns wait times
in minutes. Do not pass the word "hospital" as input, only the
hospital name itself. For example, if the prompt is "What is the
current wait time at Jordan Inc Hospital?", the input should be
"Jordan Inc".
""",
),
Tool(
name="Availability",
func=find_most_available_hospital,
description="""
Use when you need to find out which hospital has the shortest
wait time. This tool does not have any information about aggregate
or historical wait times. This tool returns a dictionary with the
hospital name as the key and the wait time in minutes as the value.
""",
),
]

chat_model = ChatOpenAI(
model=HOSPITAL_AGENT_MODEL,
temperature=0,
)

hospital_rag_agent = create_openai_functions_agent(
llm=chat_model,
prompt=hospital_agent_prompt,
tools=tools,
)

hospital_rag_agent_executor = AgentExecutor(
agent=hospital_rag_agent,
tools=tools,
return_intermediate_steps=True,
verbose=True,
)
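The executor above routes each prompt to one of the four tools based on their names and descriptions. As a rough illustration of that dispatch pattern only — the keyword check is a stand-in for the LLM's actual routing decision, and the tool bodies are fakes:

```python
# Toy illustration of the dispatch the agent executor performs: each
# Tool pairs a name with a callable, and the chosen tool receives the
# prompt as input. The keyword check stands in for the LLM's routing
# decision; the tool bodies return canned strings.
def experiences(prompt: str) -> str:
    return f"vector search over reviews for: {prompt}"

def waits(prompt: str) -> str:
    return "current wait time: 25 minutes"

toy_tools = {"Experiences": experiences, "Waits": waits}

def toy_route(prompt: str) -> str:
    name = "Waits" if "wait time" in prompt.lower() else "Experiences"
    return toy_tools[name](prompt)

print(toy_route("Are patients satisfied with their care?"))
```

The real agent does far more (it can call several tools, pass transformed inputs, and reason over intermediate results), but the name-to-callable mapping is the core of what a `Tool` contributes.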
142 changes: 142 additions & 0 deletions langchain-rag-app/chatbot_api/src/chains/hospital_cypher_chain.py
@@ -0,0 +1,142 @@
import os
from langchain.graphs import Neo4jGraph
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

HOSPITAL_QA_MODEL = os.getenv("HOSPITAL_QA_MODEL")
HOSPITAL_CYPHER_MODEL = os.getenv("HOSPITAL_CYPHER_MODEL")

graph = Neo4jGraph(
url=os.getenv("NEO4J_URI"),
username=os.getenv("NEO4J_USERNAME"),
password=os.getenv("NEO4J_PASSWORD"),
)

graph.refresh_schema()

cypher_generation_template = """
Task:
Generate Cypher query for a Neo4j graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note:
Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything other than
for you to construct a Cypher statement. Do not include any text except
the generated Cypher statement. Make sure the direction of the relationship is
correct in your queries. Make sure you alias both entities and relationships
properly. Do not run any queries that would add to or delete from
the database. Make sure to alias everything carried through a WITH
clause (e.g. WITH v AS visit, c.billing_amount AS billing_amount).
If you need to divide numbers, make sure to
filter the denominator to be non-zero.
Examples:
# Who is the oldest patient and how old are they?
MATCH (p:Patient)
RETURN p.name AS oldest_patient,
duration.between(date(p.dob), date()).years AS age
ORDER BY age DESC
LIMIT 1
# Which physician has billed the least to Cigna?
MATCH (p:Payer)<-[c:COVERED_BY]-(v:Visit)-[t:TREATS]-(phy:Physician)
WHERE p.name = 'Cigna'
RETURN phy.name AS physician_name, SUM(c.billing_amount) AS total_billed
ORDER BY total_billed
LIMIT 1
# Which state had the largest percent increase in Cigna visits
# from 2022 to 2023?
MATCH (h:Hospital)<-[:AT]-(v:Visit)-[:COVERED_BY]->(p:Payer)
WHERE p.name = 'Cigna' AND v.admission_date >= '2022-01-01' AND
v.admission_date < '2024-01-01'
WITH h.state_name AS state, COUNT(v) AS visit_count,
SUM(CASE WHEN v.admission_date >= '2022-01-01' AND
v.admission_date < '2023-01-01' THEN 1 ELSE 0 END) AS count_2022,
SUM(CASE WHEN v.admission_date >= '2023-01-01' AND
v.admission_date < '2024-01-01' THEN 1 ELSE 0 END) AS count_2023
WITH state, visit_count, count_2022, count_2023,
(toFloat(count_2023) - toFloat(count_2022)) / toFloat(count_2022) * 100
AS percent_increase
RETURN state, percent_increase
ORDER BY percent_increase DESC
LIMIT 1
# How many non-emergency patients in North Carolina have written reviews?
MATCH (r:Review)<-[:WRITES]-(v:Visit)-[:AT]->(h:Hospital)
WHERE h.state_name = 'NC' AND v.admission_type <> 'Emergency'
RETURN count(*)
String category values:
Test results are one of: 'Inconclusive', 'Normal', 'Abnormal'
Visit statuses are one of: 'OPEN', 'DISCHARGED'
Admission Types are one of: 'Elective', 'Emergency', 'Urgent'
Payer names are one of: 'Cigna', 'Blue Cross', 'UnitedHealthcare', 'Medicare',
'Aetna'
A visit is considered open if its status is 'OPEN' and the discharge date is
missing.
Use abbreviations when filtering on hospital states (e.g. "Texas" is "TX").
Make sure to use IS NULL or IS NOT NULL when analyzing missing properties.
Never return embedding properties in your queries. You must never include the
statement "GROUP BY" in your query. Make sure to alias everything carried
through a WITH clause (e.g. WITH v AS visit, c.billing_amount AS
billing_amount).
If you need to divide numbers, make sure to filter the denominator to be
non-zero.
The question is:
{question}
"""

cypher_generation_prompt = PromptTemplate(
input_variables=["schema", "question"], template=cypher_generation_template
)

qa_generation_template = """You are an assistant that takes the results
from a Neo4j Cypher query and forms a human-readable response. The
information section contains the results of a Cypher query that was
generated based on a user's natural language question. The provided
information is authoritative; you must never doubt it or try to use
your internal knowledge to correct it. Make the answer sound like a
response to the question.
Query Results:
{context}
Question:
{question}
If the provided information is empty, say you don't know the answer.
Empty information looks like this: []
If the information is not empty, you must provide an answer. If the
question involves a time duration, assume this duration is in days
unless otherwise specified.
Helpful Answer:
"""

qa_generation_prompt = PromptTemplate(
input_variables=["context", "question"], template=qa_generation_template
)

hospital_cypher_chain = GraphCypherQAChain.from_llm(
cypher_llm=ChatOpenAI(model=HOSPITAL_CYPHER_MODEL, temperature=0),
qa_llm=ChatOpenAI(model=HOSPITAL_QA_MODEL, temperature=0),
graph=graph,
verbose=True,
qa_prompt=qa_generation_prompt,
cypher_prompt=cypher_generation_prompt,
validate_cypher=True,
top_k=100,
)
56 changes: 56 additions & 0 deletions langchain-rag-app/chatbot_api/src/chains/hospital_review_chain.py
@@ -0,0 +1,56 @@
import os
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain.prompts import (
PromptTemplate,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
ChatPromptTemplate,
)

HOSPITAL_QA_MODEL = os.getenv("HOSPITAL_QA_MODEL")

neo4j_vector_index = Neo4jVector.from_existing_graph(
embedding=OpenAIEmbeddings(),
url=os.getenv("NEO4J_URI"),
username=os.getenv("NEO4J_USERNAME"),
password=os.getenv("NEO4J_PASSWORD"),
index_name="reviews",
node_label="Review",
text_node_properties=["physician_name",
"patient_name",
"text",
"hospital_name"],
embedding_node_property="embedding",
)

review_template = """Your job is to use patient
reviews to answer questions about their experience at a hospital. Use
the following context to answer questions. Be as detailed as possible, but
don't make up any information that's not from the context. If you don't know
an answer, say you don't know.
{context}
"""

review_system_prompt = SystemMessagePromptTemplate(
prompt=PromptTemplate(
input_variables=["context"], template=review_template)
)

review_human_prompt = HumanMessagePromptTemplate(
prompt=PromptTemplate(input_variables=["question"], template="{question}")
)
messages = [review_system_prompt, review_human_prompt]

review_prompt = ChatPromptTemplate(
input_variables=["context", "question"], messages=messages
)

reviews_vector_chain = RetrievalQA.from_chain_type(
llm=ChatOpenAI(model=HOSPITAL_QA_MODEL, temperature=0),
chain_type="stuff",
retriever=neo4j_vector_index.as_retriever(k=12),
)
reviews_vector_chain.combine_documents_chain.llm_chain.prompt = review_prompt
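The `chain_type="stuff"` above means every retrieved review is concatenated ("stuffed") into the prompt's `{context}` slot before a single LLM call. A minimal sketch of just that combine step, with invented placeholder reviews:

```python
# Minimal sketch of the "stuff" combine strategy: every retrieved
# document is concatenated into the prompt's {context} slot before a
# single LLM call. The reviews here are invented placeholders.
system_template = (
    "Your job is to use patient reviews to answer questions about "
    "their experience at a hospital.\n{context}"
)

retrieved = [
    "The staff were attentive and kind.",
    "Waiting times were longer than expected.",
]

context = "\n\n".join(retrieved)  # "stuff" all documents together
prompt = system_template.format(context=context)
assert all(doc in prompt for doc in retrieved)
```

This is why the retriever's `k=12` matters: all twelve reviews must fit in the model's context window at once, unlike map-reduce-style chains that summarize documents in batches.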
7 changes: 7 additions & 0 deletions langchain-rag-app/chatbot_api/src/entrypoint.sh
@@ -0,0 +1,7 @@
#!/bin/bash

# Run any setup steps or pre-processing tasks here
echo "Starting hospital RAG FastAPI service..."

# Start the main application
uvicorn main:app --host 0.0.0.0 --port 8000