$:.unshift File.expand_path(File.join(File.dirname(__FILE__), 'lib'))
require 'scrape'
require 'rake/testtask'
require 'json'
require 'pry'
require 'fileutils'
require 'parallel'
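# pry is presumably only required for interactive debugging; parallel
# provides the Parallel.each used for concurrent scraping below.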
CALENDAR_URI = "https://ratsinfo.dresden.de/si0040.asp?__cjahr=%d&__cmonat=%s"
SESSION_URI = "https://ratsinfo.dresden.de/si0050.asp?__ksinr=%d"
DOWNLOAD_PATH = ENV["DOWNLOAD_PATH"] || File.join(File.dirname(__FILE__), "data")
VORLAGEN_LISTE_URI = "https://ratsinfo.dresden.de/vo0042.asp?__cwpall=1"
VORLAGE_URI = "https://ratsinfo.dresden.de/vo0050.asp?__kvonr=%s"
ANFRAGEN_LISTE_URI = "https://ratsinfo.dresden.de/ag0080.asp?smcadat=2458000"
ANFRAGE_URI = "https://ratsinfo.dresden.de/ag0050.asp?__kagnr=%s"
GREMIEN_LISTE_URI = "https://ratsinfo.dresden.de/gr0040.asp?__cwpall=1&"
PERSON_INFO_URI = "https://ratsinfo.dresden.de/pe0051.asp?__cwpall=1&__kpenr=%d"
PERSON_GREMIEN_URI = "https://ratsinfo.dresden.de/kp0050.asp?__cwpall=1&__kpenr=%d"
FILE_URI = "https://ratsinfo.dresden.de/getfile.asp?id=%s&type=do"
CONCURRENCY = 16
directory DOWNLOAD_PATH
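# The URI templates above are filled in with sprintf; CONCURRENCY bounds the
# number of worker processes Parallel forks. `directory` defines a task that
# creates DOWNLOAD_PATH on demand, though :scrape_sessions only checks that it
# already exists.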
scrape_start_date = Date.new(2009, 8)
scrape_end_date = Time.now.to_date
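# Default range: every month from August 2009 (presumably the earliest data on
# the site) up to the current month; :testmonth narrows these locals before
# invoking scrape_sessions.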
task :default => [:scrape_sessions]
desc "Scrape Documents from https://ratsinfo.dresden.de with a minmal timerange"
task :testmonth do
scrape_start_date = Date.new(2019, 9)
scrape_end_date = Date.new(2019, 10)
Rake::Task["scrape_sessions"].invoke
end
desc "Scrape Documents from https://ratsinfo.dresden.de"
task :scrape => [
  :scrape_gremien,
  :scrape_people,
  :scrape_anfragen, :scrape_vorlagen,
  :scrape_sessions,
  :fetch_meetings_anfragen,
  :fetch_files
]
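# Rake runs the prerequisites above in order, once each; :scrape itself has no
# body of its own.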
task :scrape_gremien do
  Scrape::GremienListeScraper.new(GREMIEN_LISTE_URI).each do |organization|
    puts "[#{organization.id}] #{organization.name}"
    organization.save_to File.join(DOWNLOAD_PATH, "gremien", "#{organization.id}.json")
  end
end
task :scrape_people do
  Scrape::PeopleScraper.new(PERSON_INFO_URI, PERSON_GREMIEN_URI).each do |person|
    puts "[#{person.id}] #{person.name}"
    person.save_to File.join(DOWNLOAD_PATH, "persons", "#{person.id}.json")
  end
end
task :test_session do
  session_url = sprintf(SESSION_URI, 7548)
  meeting = Scrape::SessionScraper.new(session_url).scrape
  require 'pp'
  pp meeting
end
task :scrape_sessions do
  raise "download path '#{DOWNLOAD_PATH}' does not exist!" unless Dir.exist?(DOWNLOAD_PATH)
  date_range = (scrape_start_date..scrape_end_date).select { |d| d.day == 1 }
  date_range.each do |date|
    uri = sprintf(CALENDAR_URI, date.year, date.month)
    s = Scrape::ConferenceCalendarScraper.new(uri)
    Parallel.each(s, :in_processes => CONCURRENCY) do |session_id|
      session_url = sprintf(SESSION_URI, session_id)
      meeting = Scrape::SessionScraper.new(session_url).scrape
      meeting.id = session_id
      puts "[#{meeting.id}] #{meeting.name}"
      meeting.save_to File.join(DOWNLOAD_PATH, "meetings", "#{meeting.id}.json")
      # meeting.persons.each do |person|
      #   persons_path = File.join(DOWNLOAD_PATH, "persons", "#{person.id}.json")
      #   person.save_to persons_path
      # end
      meeting.files.each do |file|
        file.save_to File.join(DOWNLOAD_PATH, "files", "#{file.id}.json")
      end
    end
  end
end
task :test_vorlage do
  id = 13658
  paper = Scrape::PaperScraper.new(sprintf(VORLAGE_URI, id)).scrape
  require 'pp'
  pp paper
end
task :scrape_vorlagen do
  Parallel.each(Scrape::VorlagenListeScraper.new(VORLAGEN_LISTE_URI), :in_processes => CONCURRENCY) do |paper|
    id = paper.id
    paper = Scrape::PaperScraper.new(sprintf(VORLAGE_URI, id)).scrape
    paper.id = id # Restore id
    puts "Vorlage #{paper.id} [#{paper.shortName}] #{paper.name}"
    paper.save_to File.join(DOWNLOAD_PATH, "vorlagen", "#{paper.id}.json")
    paper.files.each do |file|
      file.save_to File.join(DOWNLOAD_PATH, "files", "#{file.id}.json")
    end
  end
end
task :scrape_anfragen do
  Parallel.each(Scrape::AnfragenListeScraper.new(ANFRAGEN_LISTE_URI), :in_processes => CONCURRENCY) do |paper|
    id = paper.id
    paper = Scrape::PaperScraper.new(sprintf(ANFRAGE_URI, id)).scrape
    paper.id = id # Restore id
    puts "Anfrage #{paper.id} [#{paper.shortName}] #{paper.name}"
    paper.save_to File.join(DOWNLOAD_PATH, "anfragen", "#{paper.id}.json")
    paper.files.each do |file|
      file.save_to File.join(DOWNLOAD_PATH, "files", "#{file.id}.json")
    end
  end
end
desc "Durchsucht alle Meetings nach weiteren Anfragen"
task :fetch_meetings_anfragen do
  Dir.glob(File.join(DOWNLOAD_PATH, "meetings", "*.json")) do |filename|
    meeting = OParl::Meeting.load_from(filename)
    Parallel.each(meeting.agendaItem, :in_processes => CONCURRENCY) do |agenda_item|
      consultation = agenda_item.consultation
      next unless consultation
      id = consultation.paper
      paper_path = File.join(DOWNLOAD_PATH, "vorlagen", "#{id}.json")
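      # Consultation papers are fetched via the Vorlage endpoint and saved
      # under vorlagen/, even when discovered through this Anfragen sweep.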
      paper = Scrape::PaperScraper.new(sprintf(VORLAGE_URI, id)).scrape
      paper.id = id # Restore id
      puts "Vorlage #{paper.id} [#{paper.shortName}] #{paper.name}"
      paper.save_to paper_path
      paper.files.each do |file|
        file.save_to File.join(DOWNLOAD_PATH, "files", "#{file.id}.json")
      end
    end
  end
end
desc "Ensure all known PDF files are fetched, even those not included in Meetings but referenced by Papers"
task :fetch_files do
  path = File.join(DOWNLOAD_PATH, "files")
  Dir.foreach path do |filename|
    next unless filename =~ /(.+)\.json$/
    id = $1
    json_path = File.join(path, filename)
    file = OParl::File.load_from(json_path)
    file.downloadUrl = sprintf(FILE_URI, id)
    pdf_path = File.join(path, "#{id}.pdf")
    # Re-download existing files occasionally to catch updates, but spread
    # the re-fetches over roughly a month by bucketing ids modulo 29:
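    # e.g. a file with id 42 (42 % 29 == 13) is re-fetched on the 14th of each month.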
    if File.exist? pdf_path and id.to_i % 29 != Time.new.day - 1
      next
    end
    puts "Fetch file #{id}: #{file.name}"
    begin
      tmp_file = Scrape.download_file(file.downloadUrl)
      file.fileName = tmp_file.file_name
      file.mimeType = tmp_file.mime_type
      file.size = tmp_file.size
      FileUtils.mv tmp_file.path, pdf_path
      tmp_file.close
      tmp_file = nil
    rescue
      puts "Error downloading #{file.downloadUrl}"
      puts $!
      next
    ensure
      # On success tmp_file is nil; on failure, unlink whatever temp file is left.
      tmp_file.unlink if tmp_file.respond_to?(:unlink)
    end
    if `sha1sum "#{pdf_path}"` =~ /([0-9a-f]+)/
      file.sha1Checksum = $1
    end
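    # Ruby's standard Digest library would be a portable alternative to
    # shelling out here, e.g. Digest::SHA1.file(pdf_path).hexdigest
    # (with require 'digest').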
    file.save_to json_path
  end
end
desc "Create plain-text versions of all PDF files, but no other mime-types"
task :pdftotext do
  path = File.join(DOWNLOAD_PATH, "files")
  Dir.foreach path do |filename|
    next unless filename =~ /(.+)\.json$/
    id = $1
    json_path = File.join(path, filename)
    file = OParl::File.load_from(json_path)
    if file.mimeType == 'application/pdf'
      pdf_path = File.join(path, "#{id}.pdf")
      txt_path = File.join(path, "#{id}.txt")
      # Only re-run if the .pdf is newer than the .txt
      if !File.exist?(txt_path) or File.mtime(pdf_path) > File.mtime(txt_path)
        puts pdf_path
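        # pdftotext writes its output next to the input by default,
        # producing "#{id}.txt" alongside "#{id}.pdf".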
        output = `pdftotext -enc UTF-8 "#{pdf_path}"`.chomp
        puts output unless output.empty?
      end
    end
  end
end
Rake::TestTask.new do |t|
  t.libs << "test"
  t.test_files = FileList['test/**/*_test.rb']
end
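# Run the test suite with `rake test` (the default task name Rake::TestTask defines).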