-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.rb
executable file
·47 lines (41 loc) · 1.12 KB
/
parse.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
require 'pdf-reader'
# Substrings that mark a line of extracted PDF text as non-data.
# `main` discards every line that CONTAINS any of these (substring match,
# not whole-line equality), so a short entry such as 'Page' removes every
# line in which that text appears anywhere.
# NOTE(review): these look like header/footer/disclaimer fragments of one
# specific edition of the source PDF (see the hard-coded date) — confirm
# they are still correct when the input document changes.
NON_DATA_STRINGS = [
'Updated March 18 , 2020',
'COVID-19 RESOURCES FOR FRANKLIN',
'COUNTY RESIDENTS',
'Compiled by Columbus Public Health',
'240 Parsons Ave',
'(614) 645-6807 or 645-3111',
'www.publichealth.columbus.gov',
'Page',
'COLUMBUS AREA FOOD PANTRIES & SOUP KITCHENS',
'Disclaimer',
'represent all the Pantries and Meals',
'information understood to'
].freeze
# Reads a PDF, removes blank and boilerplate lines, collapses each page's
# side-by-side columns into a single column, and writes the resulting lines
# to a plain-text file (one line of text per output line).
#
# @param input_path [String] path of the PDF to read; defaults to the file
#   name this script has always used
# @param output_path [String] path of the text file to (over)write
# @return [Array<Array<String>>] the processed lines, grouped by page
def main(input_path = 'COVID-19 RESOURCES.pdf', output_path = 'parsed_pdf.txt')
  reader = PDF::Reader.new(input_path)

  pages = reader.pages.map do |page|
    lines = page.text.split(/\n+/).reject do |line|
      # Drop empty lines and any line containing a known non-data substring.
      line.strip.empty? || NON_DATA_STRINGS.any? { |s| line.include?(s) }
    end
    reduce_page_to_single_column(lines)
  end

  File.open(output_path, 'w') do |f|
    # IO#puts writes each element of an array on its own line; writing page
    # by page keeps the output identical to the previous implementation.
    pages.each { |page| f.puts page }
  end
end
# Collapses the lines of one page of side-by-side text into a single column.
#
# Each line is split wherever a space is followed by at least one more
# whitespace character. The first cell goes to the left column and the second
# (if any) to the right column; lines that split into more than two cells are
# skipped entirely. Empty cells are ignored, and empty or whitespace-only
# lines contribute nothing instead of raising.
#
# @param page [Array<String>] the lines of a single page
# @return [Array<String>] every left-column cell in order, followed by every
#   right-column cell in order, each stripped of surrounding whitespace
def reduce_page_to_single_column(page)
  first_column = []
  second_column = []

  page.each do |line|
    left, right, *extra = line.split(/ \s+/)
    next unless extra.empty?

    # `split` returns [] for an empty/blank line, leaving both cells nil;
    # to_s makes the emptiness checks below safe in that case.
    left = left.to_s.strip
    right = right.to_s.strip
    first_column << left unless left.empty?
    second_column << right unless right.empty?
  end

  first_column + second_column
end
# Script entry point: the conversion runs as soon as this file is executed
# or loaded (there is no `__FILE__ == $PROGRAM_NAME` guard).
main