-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_form.py
45 lines (38 loc) · 1.31 KB
/
parse_form.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import csv
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
def convert_pdf_to_txt(path):
    """Return the text pdfminer extracts from the PDF file at *path*.

    Pages are requested from ``PDFPage.get_pages`` with an empty page-number
    set, ``maxpages=0`` and an empty password, and each returned page is fed
    through a ``TextConverter`` that writes into an in-memory buffer.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).

    Returns:
        The accumulated text of all processed pages as a single ``str``.

    Raises:
        OSError: If *path* cannot be opened.
        Any exception raised by pdfminer while parsing propagates unchanged;
        the file, the converter and the buffer are still closed in that case.
    """
    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(
            rsrcmgr, retstr, codec='utf-8', laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The context manager guarantees the PDF handle is released even
            # when pdfminer raises part-way through the document.
            with open(path, 'rb') as fp:
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
            # Read the buffer before the converter and buffer are closed.
            return retstr.getvalue()
        finally:
            device.close()
def create_csv_file(input_path, output_path):
    """Write the text of a PDF to a two-column CSV file.

    The text returned by ``convert_pdf_to_txt(input_path)`` is split on
    newlines, blank (whitespace-only) lines are dropped, and the remaining
    lines are paired in order: the 1st, 3rd, 5th, ... line becomes the first
    column and the line following it becomes the second column.

    Fields are written through ``csv.writer`` so that a value containing a
    comma or a quote is quoted instead of silently splitting into extra
    columns. If the number of non-blank lines is odd, the last row gets an
    empty second column so every row is complete and newline-terminated.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path of the CSV file to create or overwrite.

    Returns:
        None. The result is the file written at *output_path*.
    """
    text = convert_pdf_to_txt(input_path)
    entries = [line for line in text.split("\n") if line.strip()]

    # newline="" hands line-ending control to the csv module (as its docs
    # recommend); UTF-8 keeps non-ASCII PDF text from failing on platforms
    # whose default encoding cannot represent it.
    with open(output_path, 'w', newline="", encoding="utf-8") as file:
        writer = csv.writer(file, lineterminator="\n")
        for start in range(0, len(entries), 2):
            row = entries[start:start + 2]
            if len(row) < 2:
                # Unpaired trailing line: pad to keep two columns.
                row.append("")
            writer.writerow(row)
# Script entry: convert the form PDF into a CSV file of the same base name.
# Both paths are relative, so they resolve against the current working
# directory; edit them to process a different file.
create_csv_file(
    input_path="submit_form_new_4.pdf",
    output_path="submit_form_new_4.csv",
)