Skip to content

Commit

Permalink
Full knowledge base management and dialog capabilities!
Browse files Browse the repository at this point in the history
Now, through the sidebar, you can locally create a knowledge base, view its contents, and perform file operations such as adding and deleting files.
Besides, you can choose to have the LLM search the entire knowledge base, or just talk and ask questions about specific files.
  • Loading branch information
Wannabeasmartguy committed Oct 18, 2023
1 parent 321d331 commit a426764
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 63 deletions.
52 changes: 21 additions & 31 deletions GPT-Gradio-Agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from langchain.chat_models import AzureChatOpenAI
from langchain.document_loaders import DirectoryLoader,PyPDFLoader,UnstructuredFileLoader
from langchain.chains import RetrievalQA
from langchain.chains.summarize import load_summarize_chain

load_dotenv()

Expand Down Expand Up @@ -158,21 +157,6 @@ def file_ask_stream(file_ask_history_list:list[list],file_answer:list):
time.sleep(0.02)
yield file_ask_history_list

def summarize_file(split_docs,chatbot,model_choice,sum_type):
    """Summarize the most recently split document with an Azure-hosted chat model.

    Parameters:
        split_docs: list of split documents; only the last entry is summarized.
        chatbot: gradio chat history (list of [user, assistant] pairs); mutated in place.
        model_choice: Azure deployment / model name to use.
        sum_type: summarize chain type passed to langchain (e.g. "stuff" or "refine").

    Returns:
        tuple: (summarize_result, chatbot)
    """
    chat_model = AzureChatOpenAI(
        model=model_choice,
        openai_api_type="azure",
        deployment_name=model_choice,  # NOTE: update here when changing the selectable models
        temperature=0.7,
    )
    # Build the summarization chain and run it on the latest split document.
    summarize_chain = load_summarize_chain(chat_model, chain_type=sum_type, verbose=True)
    summarize_result = summarize_chain.run(split_docs[-1])
    # Record the implicit user request so the chatbot UI shows the exchange.
    chatbot.append(["Please summarize the file for me.",None])
    return summarize_result,chatbot

def sum_stream(summarize_result,chatbot):
'''
    Used to make the summarized result be output as a stream.
Expand Down Expand Up @@ -201,7 +185,7 @@ def rst_mem(chat_his:list):
usr_msg = gr.State()
chat_his = gr.State([])
with gr.Row():
with gr.Column(scale=1.8):
with gr.Column(scale=2):
model_choice = gr.Radio(choices=["gpt-35-turbo","gpt-35-turbo-16k","gpt-4"],
value="gpt-35-turbo",
label="Model",info="支持模型选择,立即生效")
Expand All @@ -210,9 +194,12 @@ def rst_mem(chat_his:list):
bubble_full_width=False)
message = gr.Textbox(label="Input your prompt",
info="'Shift + Enter' to begin an new line. Press 'Enter' can also send your Prompt to the LLM.")
with gr.Row(scale=0.1):
with gr.Row():
clear = gr.ClearButton([message, chat_bot,chat_his],scale=1,size="sm")
send = gr.Button("Send",scale=2)
with gr.Row():
chat_with_file = gr.Button(value="Chat with file (Valid for knowledge base)")
summarize = gr.Button(value="Summarize (Valid only for uploaded file)")

with gr.Column():
with gr.Tab("Chat"):
Expand Down Expand Up @@ -247,25 +234,28 @@ def rst_mem(chat_his:list):
vector_path = gr.Text(label="Knowledge base save path",
info="Choose the folder you want to save, and PASTE THE ABSOLUTE PATH here")
with gr.Row():
vector_content = gr.DataFrame(label="Knowledge Base Document Catalog",
interactive=False,
)
vector_content = gr.DataFrame(#label="Knowledge Base Document Catalog",
value = pd.DataFrame(columns=['文件名称']),
interactive=False,
)
file_list = gr.Dropdown(interactive=True,
# allow_custom_value=True,
label="File list")
with gr.Column():
create_vec_but = gr.Button(value="Create a new knowledge base")
load_vec = gr.Button(value="Load your knowledge base")
with gr.Row():
add_file = gr.Button(value="Add it(The file uploaded) to knowledge base")
delete_file = gr.Button(value="Delete it(selected in dropdown) from knowledge base")
sum_type = gr.Radio(choices=[("小文件(file with few words)","stuff"),("大文件(file with a large word count)","refine")],
value="stuff",
label="Choose the type of file to be summarized",
info="如果待总结字数较多,请选择“大文件”(选小文件可能导致超出 GPT 的最大 Token )")
with gr.Row():
chat_with_file = gr.Button(value="Chat with file")
summarize = gr.Button(value="Summarize")
add_file = gr.Button(value="Add it (The file uploaded) to knowledge base")
delete_file = gr.Button(value="Delete it (Selected in dropdown) from knowledge base")
with gr.Accordion("File chat setting"):
filter_choice = gr.Radio(choices=["All", "Selected file"],
value="All",
label="Search scope",
info="“All” means whole knowledge base;“Selected file” means the file selected in dropdown")
sum_type = gr.Radio(choices=[("small file","stuff"),("large file","refine")],
value="stuff",
label="File size type",
info="也作用于“Summarize”。如果待总结字数较多,请选择“lagre size”(选“large size”可能导致超出 GPT 的最大 Token )")

# Merge all handles that require input and output.
input_param = [message, model_choice, chat_his, chat_bot, System_Prompt,
Expand All @@ -283,7 +273,7 @@ def rst_mem(chat_his:list):

# chat_file button event
file.upload(upload_file,inputs=[file,split_tmp],outputs=[split_tmp,file],show_progress="full")
chat_with_file.click(ask_file,inputs=[chat_bot,message,file_answer,model_choice,sum_type,vector_path,file_list],outputs=[chat_bot,file_answer]).then(file_ask_stream,[chat_bot,file_answer],[chat_bot])
chat_with_file.click(ask_file,inputs=[chat_bot,message,file_answer,model_choice,sum_type,vector_path,file_list,filter_choice],outputs=[chat_bot,file_answer]).then(file_ask_stream,[chat_bot,file_answer],[chat_bot])
summarize.click(summarize_file,inputs=[split_tmp,chat_bot,model_choice,sum_type],outputs=[sum_result,chat_bot]).then(sum_stream,[sum_result,chat_bot],[chat_bot])

chat_with_file.click(lambda: gr.update(value=''), [],[message])
Expand Down
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,11 @@ Then use `pip install -r requirements.txt` on the Command Prompt to install the

- [x] Local knowledge base management

- [ ] Chat with whole knowledge base

- [ ] Local storage of data
- [x] Chat with whole knowledge base

- [ ] List citation sources

- [ ] Estimated cost of embedding files

- [ ] Import and export chat history

Expand Down
10 changes: 5 additions & 5 deletions README_zh-cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@

- [x] 文件全文总结

- [x] 知识库本地存储

- [x] 知识库本地管理

- [ ] 多文件对话

- [ ] 数据本地存储
- [x] 知识库全局检索与对话

- [ ] 显示引用来源

- [ ] 预估嵌入文件的费用

- [ ] 聊天记录导入、导出

Expand Down
50 changes: 26 additions & 24 deletions vecstore/vecstore.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

from langchain.chains.summarize import load_summarize_chain
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
Expand Down Expand Up @@ -148,26 +148,14 @@ def refresh_file_list(df):
gr.Info('Successfully update kowledge base.')
return gr.Dropdown.update(choices=file_list)

def find_source_paths(filenames, data):
    """Collect the unique source paths in *data* that contain any of *filenames*.

    Scans data['metadatas'] in order and keeps the first occurrence of each
    matching 'source' path, so the result is duplicate-free and order-preserving.

    Returns:
        list: matching source paths.
    """
    matched = []
    for meta in data['metadatas']:
        src = meta.get('source')
        if not src or src in matched:
            continue
        if any(name in src for name in filenames):
            matched.append(src)
    return matched

def ask_file(file_ask_history_list:list,
question_prompt: str,
file_answer:list,
model_choice:str,
sum_type:str,
persist_vec_path,
file_list
file_list,
filter_type:str,
):
'''
send splitted file to LLM
Expand All @@ -178,11 +166,11 @@ def ask_file(file_ask_history_list:list,
temperature=0.7)

source_data = vectorstore.get()
filter_goal = find_source_paths(filenames=file_list,data=source_data)
filter_goal = find_source_paths(file_list,source_data)

if persist_vec_path != None:
# docsearch = Chroma.from_documents(split_docs[-1], embeddings)
if file_list == "Unselect file(s)" or file_list != None:
# Codes here in "if" may be deleted or modified later
if filter_type == "All":
# unselect file: retrieve whole knowledge base
try:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type=sum_type,
Expand All @@ -191,7 +179,7 @@ def ask_file(file_ask_history_list:list,
result = qa({"query": question_prompt})
except (NameError):
raise gr.Error("You have not load kownledge base yet.")
else:
elif filter_type == "Selected file":
# only selected one file
# Retrieve the specified knowledge base with filter
qa = RetrievalQA.from_chain_type(llm=llm, chain_type=sum_type,
Expand All @@ -211,15 +199,29 @@ def ask_file(file_ask_history_list:list,
file_ask_history_list.append([usr_prob,None])
return file_ask_history_list,file_answer

def find_source_paths(filenames:list, data):
def summarize_file(split_docs,chatbot,model_choice,sum_type):
    """Summarize the most recently split document and log the request in the chat history.

    Parameters:
        split_docs: list of split documents; only the last entry is summarized.
        chatbot: gradio chat history (list of [user, assistant] pairs); mutated in place.
        model_choice: Azure deployment / model name to use.
        sum_type: summarize chain type passed to langchain (e.g. "stuff" or "refine").

    Returns:
        tuple: (summarize_result, chatbot)
    """
    llm = AzureChatOpenAI(model=model_choice,
                    openai_api_type="azure",
                    deployment_name=model_choice, # <---------- update here when changing the selectable models
                    temperature=0.7)
    # Create the summarization chain
    chain = load_summarize_chain(llm, chain_type=sum_type, verbose=True)

    # Run the summarization chain on the latest split document
    summarize_result = chain.run(split_docs[-1])

    # Append the implicit user request in chatbot list format
    chatbot.append(["Please summarize the file for me.",None])
    return summarize_result,chatbot

def find_source_paths(filename:str, data:dict):
    """Find the source paths in the knowledge base whose path contains *filename*.

    Iterates data['metadatas'] in order; each matching 'source' path is kept
    once, preserving first-seen order.

    Returns:
        list: matching source paths, without duplicates.
    """
    seen = set()
    matches = []
    for meta in data['metadatas']:
        src = meta.get('source')
        if not src or filename not in src or src in seen:
            continue
        seen.add(src)
        matches.append(src)
    return matches

0 comments on commit a426764

Please sign in to comment.