From f2573d0f3a275c2968ea0221e7200eab1a742f3b Mon Sep 17 00:00:00 2001 From: takuro Date: Sat, 5 Apr 2014 22:41:47 +0900 Subject: [PATCH 1/4] Support other charsets --- CHANGELOG.rdoc | 6 ++++++ Rakefile | 4 ++-- lib/anemone/page.rb | 17 ++++++++++++++++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rdoc b/CHANGELOG.rdoc index 9aeb3793..b37cd06e 100644 --- a/CHANGELOG.rdoc +++ b/CHANGELOG.rdoc @@ -1,3 +1,9 @@ +== 0.7.2-fork / 2014-04-05 + +* Minor enhancements + + * Support other charsets + == 0.7.2 / 2012-05-30 * Bug fixes diff --git a/Rakefile b/Rakefile index 81d0b581..0596ff54 100644 --- a/Rakefile +++ b/Rakefile @@ -1,5 +1,5 @@ require 'rspec/core/rake_task' -require 'rake/rdoctask' +require 'rdoc/task' desc "Run all specs" RSpec::Core::RakeTask.new(:rspec) do |spec| @@ -13,7 +13,7 @@ end task :default => :rspec -Rake::RDocTask.new(:rdoc) do |rdoc| +RDoc::Task.new do |rdoc| version = File.exist?('VERSION') ? File.read('VERSION') : "" rdoc.rdoc_dir = 'rdoc' diff --git a/lib/anemone/page.rb b/lib/anemone/page.rb index b157ad63..9629d2b6 100644 --- a/lib/anemone/page.rb +++ b/lib/anemone/page.rb @@ -74,7 +74,14 @@ def links # def doc return @doc if @doc - @doc = Nokogiri::HTML(@body) if @body && html? rescue nil + if @body && html? + if charset == 'utf-8' || charset.nil? + body = @body + else + body = @body.encode("UTF-8", charset, :invalid => :replace, :undef => :replace) rescue nil + end + @doc = Nokogiri::HTML(body) if body + end end # @@ -107,6 +114,14 @@ def content_type headers['content-type'].first end + # + # The charset returned by the content-type request for this page + # + def charset + matcher = content_type.match(/charset=[\"]?([a-zA-Z\_\-\d]*)[\"]?/) + matcher[1].downcase if matcher + end + # # Returns +true+ if the page is a HTML document, returns +false+ # otherwise. From f7c1d4a0f059d356094449c0e82866b7290f6a9d Mon Sep 17 00:00:00 2001 From: takuro Date: Thu, 19 Jun 2014 07:15:40 +0900 Subject: [PATCH 2/4] editing mysql storage --- .gitignore | 1 + lib/anemone/storage.rb | 4 ++ lib/anemone/storage/mysql.rb | 91 ++++++++++++++++++++++++++++++++++++ lib/anemone/storage/s3.rb | 66 ++++++++++++++++++++++++++ 4 files changed, 162 insertions(+) create mode 100644 lib/anemone/storage/mysql.rb create mode 100644 lib/anemone/storage/s3.rb diff --git a/.gitignore b/.gitignore index 453303f4..2b3fb0ca 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ test.db test.tch *.kch rdoc +*.gem diff --git a/lib/anemone/storage.rb b/lib/anemone/storage.rb index 9c4ab096..0eba4bbc 100644 --- a/lib/anemone/storage.rb +++ b/lib/anemone/storage.rb @@ -40,5 +40,9 @@ def self.SQLite3(file = 'anemone.db') self::SQLite3.new(file) end + def self.SQLite3(opts = {}) + require 'anemone/storage/mysql' + self::MySQL.new(opts) + end end end diff --git a/lib/anemone/storage/mysql.rb b/lib/anemone/storage/mysql.rb new file mode 100644 index 00000000..c7f15af1 --- /dev/null +++ b/lib/anemone/storage/mysql.rb @@ -0,0 +1,91 @@ +begin + require 'mysql2' +rescue LoadError + puts "You need the mysql2 gem to use Anemone::Storage::MySQL" + exit +end + +module Anemone + module Storage + class MySQL + + def initialize(opts = {}) + @db = Mysql2::Client.new(:host => "localhost", :username => "crawler", :password => "anemone_pass", :database => "anemone") + create_schema + end + + def [](url) + value = @db.get_first_value('SELECT data FROM anemone_storage WHERE page_key = ?', url.to_s) + if value + Marshal.load(value) + end + end + + def []=(url, value) + data = Marshal.dump(value) + if has_key?(url) + @db.execute('UPDATE anemone_storage SET page_data = ? WHERE page_key = ?', data, url.to_s) + else + @db.execute('INSERT INTO anemone_storage (page_data, page_key) VALUES(?, ?)', data, url.to_s) + end + end + + def delete(url) + page = self[url] + @db.execute('DELETE FROM anemone_storage WHERE page_key = ?', url.to_s) + page + end + + def each + @db.execute("SELECT page_key, page_data FROM anemone_storage ORDER BY id") do |row| + value = Marshal.load(row[1]) + yield row[0], value + end + end + + def merge!(hash) + hash.each { |key, value| self[key] = value } + self + end + + def size + @db.get_first_value('SELECT COUNT(id) FROM anemone_storage') + end + + def keys + @db.execute("SELECT page_key FROM anemone_storage ORDER BY id").map{|t| t[0]} + end + + def has_key?(url) + !!@db.get_first_value('SELECT id FROM anemone_storage WHERE page_key = ?', url.to_s) + end + + def close + @db.close + end + + private + + def create_schema + @db.query < ENV['AWS_ACCESS_KEY']', + :secret_access_key => ENV['AWS_SECRET_ACCESS_KEY'] + ) + @s3 = AWS::S3.new + @bucket = @s3.buckets[bucket] + end + + def [](url) + @bucket.objects[url2hash(url)].read + end + + def []=(url, value) + object = @bucket.objects[url2hash(url)] + object.write(value) + end + + def delete(url) + @bucket.objects.delete(url2hash(url)) + end + + def each + #TODO + end + + def merge!(hash) + #TODO + end + + def size + #TODO + end + + def keys + #TODO + end + + def has_key?(url) + #TODO + end + + def close + #TODO + end + + private + + def url2hash(url) + Digest::SHA1.digest(url) + end + end + end +end + From 3b186d1c10eb7a128248a21bad53b25a36aa2e1a Mon Sep 17 00:00:00 2001 From: takuro Date: Fri, 20 Jun 2014 02:26:14 +0900 Subject: [PATCH 3/4] add mysql --- lib/anemone/storage.rb | 2 +- lib/anemone/storage/mysql.rb | 32 ++++++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/lib/anemone/storage.rb b/lib/anemone/storage.rb index 0eba4bbc..ce219e4f 100644 --- a/lib/anemone/storage.rb +++ b/lib/anemone/storage.rb @@ -40,7 +40,7 @@ def self.SQLite3(file = 'anemone.db') self::SQLite3.new(file) end - def self.SQLite3(opts = {}) + def self.MySQL(opts = {}) require 'anemone/storage/mysql' self::MySQL.new(opts) end diff --git a/lib/anemone/storage/mysql.rb b/lib/anemone/storage/mysql.rb index c7f15af1..8879d73c 100644 --- a/lib/anemone/storage/mysql.rb +++ b/lib/anemone/storage/mysql.rb @@ -1,3 +1,5 @@ +# coding: utf-8 + begin require 'mysql2' rescue LoadError @@ -10,12 +12,16 @@ module Storage class MySQL def initialize(opts = {}) - @db = Mysql2::Client.new(:host => "localhost", :username => "crawler", :password => "anemone_pass", :database => "anemone") + host = opts[:host] || 'localhost' + username = opts[:username] || 'crawler' + password = opts[:password] || 'anemone_pass' + database = opts[:database] || 'anemone' + @db = Mysql2::Client.new(:host => #{host}, :username => #{username}, :password => #{password}, :database => #{database}) create_schema end def [](url) - value = @db.get_first_value('SELECT data FROM anemone_storage WHERE page_key = ?', url.to_s) + value = @db.query("SELECT data FROM anemone_storage WHERE page_key = '#{get_hash_value(url)}'").first['data'] if value Marshal.load(value) end @@ -23,16 +29,17 @@ def [](url) def []=(url, value) data = Marshal.dump(value) + key = get_hash_value(url) if has_key?(url) - @db.execute('UPDATE anemone_storage SET page_data = ? WHERE page_key = ?', data, url.to_s) + @db.query("UPDATE anemone_storage SET page_data = '#{data}' WHERE page_key = '#{key}'") else - @db.execute('INSERT INTO anemone_storage (page_data, page_key) VALUES(?, ?)', data, url.to_s) + @db.query("INSERT INTO anemone_storage (page_key, page_data) VALUES('#{key}', '#{data}')") end end def delete(url) page = self[url] - @db.execute('DELETE FROM anemone_storage WHERE page_key = ?', url.to_s) + @db.query("DELETE FROM anemone_storage WHERE page_key = '#{get_hash_value(url)}'") page end @@ -49,15 +56,21 @@ def merge!(hash) end def size - @db.get_first_value('SELECT COUNT(id) FROM anemone_storage') + @db.query('SELECT COUNT(*) FROM anemone_storage') end def keys - @db.execute("SELECT page_key FROM anemone_storage ORDER BY id").map{|t| t[0]} + @db.query("SELECT page_key FROM anemone_storage ORDER BY id").map{|t| t[0]} end def has_key?(url) - !!@db.get_first_value('SELECT id FROM anemone_storage WHERE page_key = ?', url.to_s) + key = get_hash_value(url) + result = @db.query("SELECT count(id) FROM anemone_storage WHERE page_key = '#{key}'") + if result.first['count(id)'] > 0 + return true + else + return false + end end def close @@ -85,6 +98,9 @@ def load_page(hash) Page.from_hash(hash) end + def get_hash_value(key) + Digest::SHA1.hexdigest(key) + end end end end From f90969478f96340ce99328fc3649fb333cd24a8a Mon Sep 17 00:00:00 2001 From: takuros Date: Fri, 20 Jun 2014 02:37:28 +0900 Subject: [PATCH 4/4] spec --- spec/storage_mysql_spec.rb | 126 +++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 spec/storage_mysql_spec.rb diff --git a/spec/storage_mysql_spec.rb b/spec/storage_mysql_spec.rb new file mode 100644 index 00000000..5fd26bb2 --- /dev/null +++ b/spec/storage_mysql_spec.rb @@ -0,0 +1,126 @@ +$:.unshift(File.dirname(__FILE__)) +require 'spec_helper' + +require 'anemone/storage/mysql.rb + +module Anemone + describe Storage do + + describe ".MySQL" do + it "returns a MySQL adapter" do + store = Anemone::Storage.MySQL + store.should be_an_instance_of(Anemone::Storage::MySQL) + store.close + end + end + + module Storage + shared_examples_for "storage engine" do + + before(:each) do + @url = SPEC_DOMAIN + @page = Page.new(URI(@url)) + end + + it "should implement [] and []=" do + @store.should respond_to(:[]) + @store.should respond_to(:[]=) + + @store[@url] = @page + @store[@url].url.should == URI(@url) + end + + it "should implement has_key?" do + @store.should respond_to(:has_key?) + + @store[@url] = @page + @store.has_key?(@url).should == true + + @store.has_key?('missing').should == false + end + + it "should implement delete" do + @store.should respond_to(:delete) + + @store[@url] = @page + @store.delete(@url).url.should == @page.url + @store.has_key?(@url).should == false + end + + it "should implement keys" do + @store.should respond_to(:keys) + + urls = [SPEC_DOMAIN, SPEC_DOMAIN + 'test', SPEC_DOMAIN + 'another'] + pages = urls.map { |url| Page.new(URI(url)) } + urls.zip(pages).each { |arr| @store[arr[0]] = arr[1] } + + (@store.keys - urls).should == [] + end + + it "should implement each" do + @store.should respond_to(:each) + + urls = [SPEC_DOMAIN, SPEC_DOMAIN + 'test', SPEC_DOMAIN + 'another'] + pages = urls.map { |url| Page.new(URI(url)) } + urls.zip(pages).each { |arr| @store[arr[0]] = arr[1] } + + result = {} + @store.each { |k, v| result[k] = v } + (result.keys - urls).should == [] + (result.values.map { |page| page.url.to_s } - urls).should == [] + end + + it "should implement merge!, and return self" do + @store.should respond_to(:merge!) + + hash = {SPEC_DOMAIN => Page.new(URI(SPEC_DOMAIN)), + SPEC_DOMAIN + 'test' => Page.new(URI(SPEC_DOMAIN + 'test'))} + merged = @store.merge! hash + hash.each { |key, value| @store[key].url.to_s.should == key } + + merged.should === @store + end + + it "should correctly deserialize nil redirect_to when loading" do + @page.redirect_to.should be_nil + @store[@url] = @page + @store[@url].redirect_to.should be_nil + end + end + + describe PStore do + it_should_behave_like "storage engine" + + before(:each) do + @test_file = 'test.pstore' + File.delete @test_file rescue nil + @store = Anemone::Storage.PStore(@test_file) + end + + after(:all) do + File.delete @test_file rescue nil + end + end + + describe MySQL do + it_should_behave_like "storage engine" + + before(:each) do + @test_file = 'test.db' + File.delete @test_file rescue nil + @store = Anemone::Storage.SQLite3(@test_file) + end + + after(:each) do + @store.close + end + + after(:all) do + File.delete @test_file rescue nil + end + + end + + end + end +end