-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy patheu_scraper.rb
executable file
·51 lines (38 loc) · 1.34 KB
/
eu_scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env ruby
require 'bundler/setup'
require 'rubygems'
require 'net/http'
require 'nokogiri'
require 'open-uri'
require 'fileutils'
# Downloads every document linked from the European Parliament register
# search results for the given year / document type / language, walking the
# paginated result list and saving each file into the local docs/ directory.

year = 2015
document_type = 'QECR' # Parliamentary Written Questions
language = 'EN' # Language => English
max_page_num = 1
current_page = 1

# Create docs/ directory if it doesn't exist
FileUtils.mkdir_p 'docs'

# Trap Ctrl-C/SIGINT signal. The handler only sets a flag; the flag is polled
# after each download so a file in flight is finished before exiting.
interrupted = false
trap("INT") { puts "Shutting Down!"; interrupted = true }

loop do
  # NOTE: the page parameter is "&currentPage=" (an HTML-entity mangling had
  # previously turned "&curren" into the currency sign).
  uri = URI.parse("http://www.europarl.europa.eu/RegistreWeb/search/typedoc.htm?codeTypeDocu=#{document_type}&year=#{year}&lg=#{language}&currentPage=#{current_page}")
  response = Net::HTTP.get(uri)
  page = Nokogiri::HTML(response)

  # Update the max page number from the title of the last pagination link.
  # When the page has no pagination links, keep the previously known value
  # instead of crashing on nil.
  last_page_link = page.css(".ep_paginate li a").last
  max_page_num = last_page_link.attr("title").to_i if last_page_link

  # Get links to questions on the current page (anchors without href are skipped)
  links = page.css(".results .notice .documents a").map { |record| record.attr("href") }.compact

  # Download files to docs/ directory
  links.each do |link|
    file_name = link.split('/').last

    if file_name && !file_name.empty?
      # Resolve the href against the page URL so relative links work, and go
      # through URI#open (open-uri) rather than Kernel#open: the latter would
      # treat a scraped string such as "|cmd" as a command to execute and no
      # longer opens URLs on Ruby 3.0+. The block form closes the stream.
      data = uri.merge(link).open(&:read)

      # Write only after the download succeeded, so a failed request does not
      # leave an empty file behind.
      File.binwrite("docs/#{file_name}", data)
    end

    # If the process has been interrupted, exit the program
    exit if interrupted
  end

  # Also honour an interrupt received while on a page without any links
  exit if interrupted

  # If no more documents remain to be downloaded
  break if current_page >= max_page_num

  # Advance to the next results page (without this the loop would re-fetch
  # page 1 forever whenever there is more than one page).
  current_page += 1
end

puts "Process Successfully Completed"