-
Notifications
You must be signed in to change notification settings - Fork 83
/
pdf_handler.py
34 lines (29 loc) · 1.36 KB
/
pdf_handler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from vectordb_handler import load_vectordb
from utils import load_config, timeit
import pypdfium2
# Module-level configuration, loaded once when this module is imported.
# get_text_chunks reads its "pdf_text_splitter" section.
config = load_config()
def get_pdf_texts(pdfs_bytes_list):
    """Extract the text of every PDF in ``pdfs_bytes_list``.

    Each item must provide a ``getvalue()`` method; its result is passed to
    ``extract_text_from_pdf``. Returns a list with one extracted text per
    input item, in input order.
    """
    extracted = []
    for uploaded_pdf in pdfs_bytes_list:
        raw_content = uploaded_pdf.getvalue()
        extracted.append(extract_text_from_pdf(raw_content))
    return extracted
def extract_text_from_pdf(pdf_bytes):
    """Return the text of all pages of a PDF, joined with newlines.

    ``pdf_bytes`` is handed directly to ``pypdfium2.PdfDocument``. Pages are
    visited in index order and each page's full text range is collected.

    The document, every page, and every text page are closed explicitly as
    soon as they are no longer needed, so the underlying PDFium handles are
    released deterministically (also when extraction raises) instead of
    waiting for garbage collection.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from contextlib import closing

    page_texts = []
    with closing(pypdfium2.PdfDocument(pdf_bytes)) as pdf_file:
        for page_number in range(len(pdf_file)):
            # Close children (text page, then page) before the document.
            with closing(pdf_file.get_page(page_number)) as page:
                with closing(page.get_textpage()) as text_page:
                    page_texts.append(text_page.get_text_range())
    return "\n".join(page_texts)
def get_text_chunks(text):
    """Split ``text`` into chunks with a ``RecursiveCharacterTextSplitter``.

    The splitter is configured from the ``"pdf_text_splitter"`` section of the
    module-level ``config``: ``"chunk_size"``, ``"overlap"`` (passed as
    ``chunk_overlap``) and ``"separators"``. Returns the result of the
    splitter's ``split_text`` call.
    """
    splitter_settings = config["pdf_text_splitter"]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=splitter_settings["chunk_size"],
        chunk_overlap=splitter_settings["overlap"],
        separators=splitter_settings["separators"],
    )
    return text_splitter.split_text(text)
def get_document_chunks(text_list):
    """Turn a list of texts into a flat list of ``Document`` chunks.

    Every text in ``text_list`` is split with ``get_text_chunks`` and each
    resulting chunk is wrapped in a ``Document`` whose ``page_content`` is the
    chunk. Chunks keep the order of their source texts.
    """
    return [
        Document(page_content=piece)
        for source_text in text_list
        for piece in get_text_chunks(source_text)
    ]
@timeit
def add_documents_to_db(pdfs_bytes):
    """Extract, chunk and store the given PDFs in the vector database.

    The PDFs are converted to text with ``get_pdf_texts``, chunked into
    documents with ``get_document_chunks``, and passed to ``add_documents`` on
    the object returned by ``load_vectordb``. A confirmation line is printed
    afterwards. Returns ``None``.
    """
    # Text extraction and chunking happen before the vector store is loaded.
    chunked_docs = get_document_chunks(get_pdf_texts(pdfs_bytes))
    store = load_vectordb()
    store.add_documents(chunked_docs)
    print("Documents added to db.")