forked from openaustralia/openaustralia-parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwikipedia.rb
executable file
·41 lines (33 loc) · 1.46 KB
/
wikipedia.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env ruby
# Figures out the URLs for the Wikipedia biography pages of Representatives and Senators
$:.unshift "#{File.dirname(__FILE__)}/lib"
require 'name'
require 'people'
require 'mechanize_proxy'
require 'configuration'
require 'extract_wikipedia_links'
# Writes Wikipedia link information to an XML file using Builder::XmlMarkup.
#
# links    - enumerable of entries; entry[0] is written as the "id" attribute
#            and entry[1] as the "wikipedia_url" attribute of a personinfo element
# filename - path of the file to write (opened in 'w' mode, so it is truncated)
#
# The output is whatever instruct! emits, followed by a single peopleinfo
# element containing one personinfo element per entry in links.
#
# Returns nil.
def write_links(links, filename)
  # Block form of File.open guarantees the handle is closed even if building
  # the XML or iterating over links raises part-way through.
  File.open(filename, 'w') do |xml|
    x = Builder::XmlMarkup.new(:target => xml, :indent => 1)
    x.instruct!
    x.peopleinfo do
      links.each { |link| x.personinfo(:id => link[0], :wikipedia_url => link[1]) }
    end
  end
  # Keep the nil return value that callers saw from the previous xml.close
  nil
end
# Script body: read the member data, look up Wikipedia links for
# Representatives and Senators, write them out as XML under
# conf.members_xml_path, then run the twfy mpinfoin.pl script with "links".
conf = Configuration.new

puts "Reading member data..."
people = PeopleCSVReader.read_members

agent = MechanizeProxy.new
# Slightly naughty because Wikipedia specifically blocks Ruby Mechanize, but I'm justifying it because we
# are using the html_cache here, which means there is generally only a very small amount of traffic
agent.user_agent_alias = 'Mac Safari'
agent.cache_subdirectory = "wikipedia"

puts "Wikipedia links for Representatives..."
links = extract_all_representative_wikipedia_links(people, agent)
write_links(links, "#{conf.members_xml_path}/wikipedia-commons.xml")
# For Representatives, just for curiosity's sake, find out which have a link back to OpenAustralia
links.each { |link| check_wikipedia_page(link[1], agent) }

puts "Wikipedia links for Senators..."
write_links(extract_all_senator_wikipedia_links(people, agent), "#{conf.members_xml_path}/wikipedia-lords.xml")

# system returns false/nil when the command fails or cannot be run; report it
# on stderr instead of ignoring it silently (the script still does not abort).
unless system(conf.web_root + "/twfy/scripts/mpinfoin.pl links")
  warn "Warning: mpinfoin.pl links did not complete successfully"
end