Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support other charsets #85

Open
wants to merge 4 commits into
base: next
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ test.db
test.tch
*.kch
rdoc
*.gem
6 changes: 6 additions & 0 deletions CHANGELOG.rdoc
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
== 0.7.2-fork / 2014-04-05

* Minor enhancements

* Support other charsets

== 0.7.2 / 2012-05-30

* Bug fixes
Expand Down
4 changes: 2 additions & 2 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
require 'rspec/core/rake_task'
require 'rake/rdoctask'
require 'rdoc/task'

desc "Run all specs"
RSpec::Core::RakeTask.new(:rspec) do |spec|
Expand All @@ -13,7 +13,7 @@ end

task :default => :rspec

Rake::RDocTask.new(:rdoc) do |rdoc|
RDoc::Task.new do |rdoc|
version = File.exist?('VERSION') ? File.read('VERSION') : ""

rdoc.rdoc_dir = 'rdoc'
Expand Down
17 changes: 16 additions & 1 deletion lib/anemone/page.rb
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,14 @@ def links
#
def doc
  return @doc if @doc
  # Nothing to parse unless this is an HTML response with a body.
  return unless @body && html?

  html = @body
  declared = charset
  # Bodies declared in another charset are transcoded to UTF-8 first;
  # bytes that cannot be converted are replaced rather than raising.
  if declared && declared != 'utf-8'
    begin
      html = @body.encode("UTF-8", declared, :invalid => :replace, :undef => :replace)
    rescue ArgumentError, EncodingError
      # Unknown or unsupported charset label: parse the raw body instead
      # of discarding the document (and with it every link on the page).
      html = @body
    end
  end

  @doc = Nokogiri::HTML(html)
end

#
Expand Down Expand Up @@ -107,6 +114,14 @@ def content_type
headers['content-type'].first
end

#
# The charset declared in the content-type header of this page's response,
# downcased (e.g. "iso-8859-1"). Returns +nil+ when there is no
# content-type or it carries no non-empty charset parameter.
#
def charset
  header = content_type
  return nil unless header

  # The parameter name is matched case-insensitively and the value may be
  # quoted; an empty value ("charset=") is treated as absent.
  matcher = header.match(/\bcharset\s*=\s*["']?([^\s;"']+)/i)
  matcher[1].downcase if matcher
end

#
# Returns +true+ if the page is a HTML document, returns +false+
# otherwise.
Expand Down
4 changes: 4 additions & 0 deletions lib/anemone/storage.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,9 @@ def self.SQLite3(file = 'anemone.db')
self::SQLite3.new(file)
end

# Builds a MySQL-backed store. The adapter file is required here, on first
# use, and +opts+ is handed unchanged to the adapter's constructor.
def self.MySQL(opts = {})
  require('anemone/storage/mysql')
  adapter_class = self::MySQL
  adapter_class.new(opts)
end
end
end
107 changes: 107 additions & 0 deletions lib/anemone/storage/mysql.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# coding: utf-8

begin
require 'mysql2'
rescue LoadError
puts "You need the mysql2 gem to use Anemone::Storage::MySQL"
exit
end

require 'digest/sha1'

module Anemone
  module Storage
    #
    # Page store backed by a MySQL table, accessed through the mysql2 gem.
    #
    # Rows are looked up by the SHA1 hex digest of the URL (page_key); the
    # URL itself is kept in page_url so that #keys and #each can hand URLs
    # back to the caller. Values are serialized with Marshal.
    #
    # NOTE(review): Marshal.load is only safe while the table is written
    # exclusively by this adapter - do not point it at untrusted data.
    #
    class MySQL

      #
      # Connects to MySQL and makes sure the storage table exists.
      # Recognised +opts+ keys: :host, :username, :password, :database.
      #
      def initialize(opts = {})
        @db = Mysql2::Client.new(
          :host     => opts[:host]     || 'localhost',
          :username => opts[:username] || 'crawler',
          :password => opts[:password] || 'anemone_pass',
          :database => opts[:database] || 'anemone'
        )
        create_schema
      end

      #
      # Returns the value stored for +url+, or +nil+ when there is none.
      #
      def [](url)
        row = run('SELECT page_data FROM anemone_storage WHERE page_key = ? LIMIT 1',
                  get_hash_value(url)).first
        Marshal.load(row['page_data']) if row
      end

      #
      # Stores +value+ under +url+, replacing any previous value.
      #
      def []=(url, value)
        data = Marshal.dump(value)
        key = get_hash_value(url)
        if has_key?(url)
          run('UPDATE anemone_storage SET page_data = ? WHERE page_key = ?', data, key)
        else
          run('INSERT INTO anemone_storage (page_key, page_url, page_data) VALUES (?, ?, ?)',
              key, url.to_s, data)
        end
      end

      #
      # Removes the entry for +url+ and returns the value it held (or +nil+).
      #
      def delete(url)
        page = self[url]
        run('DELETE FROM anemone_storage WHERE page_key = ?', get_hash_value(url))
        page
      end

      #
      # Yields every stored (url, value) pair in insertion order.
      #
      def each
        # Mysql2::Client has no #execute; #query returns an enumerable result
        # whose rows are hashes keyed by column name (mysql2's default).
        @db.query('SELECT page_url, page_data FROM anemone_storage ORDER BY id').each do |row|
          yield row['page_url'], Marshal.load(row['page_data'])
        end
      end

      #
      # Stores every pair of +hash+ and returns the store itself.
      #
      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      #
      # Number of stored entries.
      #
      def size
        @db.query('SELECT COUNT(*) AS total FROM anemone_storage').first['total']
      end

      #
      # All stored URLs in insertion order.
      #
      def keys
        @db.query('SELECT page_url FROM anemone_storage ORDER BY id').map { |row| row['page_url'] }
      end

      #
      # Returns +true+ if a value is stored for +url+, +false+ otherwise.
      #
      def has_key?(url)
        !run('SELECT 1 FROM anemone_storage WHERE page_key = ? LIMIT 1',
             get_hash_value(url)).empty?
      end

      #
      # Closes the database connection.
      #
      def close
        @db.close
      end

      private

      #
      # Creates the storage table on first use. LONGBLOB is used because a
      # marshalled page includes the response body, which can easily exceed
      # the 64 KB limit of a plain BLOB column.
      #
      def create_schema
        @db.query <<SQL
create table if not exists anemone_storage (
  id int(11) NOT NULL auto_increment,
  page_key varchar(255),
  page_url TEXT,
  page_data LONGBLOB,
  PRIMARY KEY (id),
  key (page_key)
) DEFAULT CHARSET=utf8
SQL
      end

      #
      # Runs +sql+ as a prepared statement with +args+ bound to its
      # placeholders, so URLs and binary Marshal data are never spliced into
      # the SQL text. Returns the rows as an array (empty for statements
      # that produce no result set) and always releases the statement.
      #
      def run(sql, *args)
        statement = @db.prepare(sql)
        statement.execute(*args).to_a
      ensure
        statement.close if statement
      end

      #
      # Fixed-length lookup key for +key+ (a URL string or URI).
      #
      def get_hash_value(key)
        Digest::SHA1.hexdigest(key.to_s)
      end
    end
  end
end

66 changes: 66 additions & 0 deletions lib/anemone/storage/s3.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# The S3 adapter cannot work without the aws-sdk gem; tell the user which
# gem is missing and stop.
begin
  require 'aws-sdk'
rescue LoadError
  puts "You need the aws-sdk gem to use Anemone::Storage::S3"
  exit
end

require 'digest/sha1'

module Anemone
  module Storage
    #
    # Page store backed by an Amazon S3 bucket (aws-sdk v1 API).
    #
    # Each value is serialized with Marshal and written to an object whose
    # key is the SHA1 hex digest of the URL.
    #
    # NOTE(review): Marshal.load is only safe while the bucket is written
    # exclusively by this adapter - do not point it at untrusted data.
    # NOTE(review): #each, #size and #keys are still unimplemented and
    # return +nil+; the URL cannot be recovered from the hashed object key.
    #
    class S3

      #
      # +bucket+ is the bucket name. +key+ and +secret+ are the AWS
      # credentials; when omitted (or +nil+) they are read from the
      # AWS_ACCESS_KEY and AWS_SECRET_ACCESS_KEY environment variables.
      #
      def initialize(bucket, key = nil, secret = nil)
        AWS.config(
          :access_key_id => key || ENV['AWS_ACCESS_KEY'],
          :secret_access_key => secret || ENV['AWS_SECRET_ACCESS_KEY']
        )
        @s3 = AWS::S3.new
        @bucket = @s3.buckets[bucket]
      end

      #
      # Returns the value stored for +url+, or +nil+ when there is none.
      #
      def [](url)
        Marshal.load(@bucket.objects[url2hash(url)].read)
      rescue AWS::S3::Errors::NoSuchKey
        nil
      end

      #
      # Stores +value+ under +url+, replacing any previous value.
      #
      def []=(url, value)
        object = @bucket.objects[url2hash(url)]
        object.write(Marshal.dump(value))
      end

      #
      # Removes the entry for +url+ and returns the value it held (or +nil+).
      #
      def delete(url)
        page = self[url]
        @bucket.objects.delete(url2hash(url))
        page
      end

      def each
        # TODO: not implemented yet
      end

      #
      # Stores every pair of +hash+ and returns the store itself.
      #
      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      def size
        # TODO: not implemented yet
      end

      def keys
        # TODO: not implemented yet
      end

      #
      # Returns +true+ if an object exists for +url+, +false+ otherwise.
      #
      def has_key?(url)
        @bucket.objects[url2hash(url)].exists?
      end

      #
      # Nothing to release: no persistent connection is held by this class.
      #
      def close
        nil
      end

      private

      #
      # Object key for +url+. The hex digest is used because the raw digest
      # is arbitrary binary data, which is not a usable S3 key.
      #
      def url2hash(url)
        Digest::SHA1.hexdigest(url.to_s)
      end
    end
  end
end

126 changes: 126 additions & 0 deletions spec/storage_mysql_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
$:.unshift(File.dirname(__FILE__))
require 'spec_helper'

# Load the adapter under test up front so Anemone::Storage::MySQL is defined
# before the example groups below reference it.
require 'anemone/storage/mysql'

module Anemone
  describe Storage do

    describe ".MySQL" do
      it "returns a MySQL adapter" do
        store = Anemone::Storage.MySQL
        store.should be_an_instance_of(Anemone::Storage::MySQL)
        store.close
      end
    end

    module Storage
      # Contract every storage adapter must satisfy. The including group is
      # expected to assign the adapter under test to @store.
      shared_examples_for "storage engine" do

        before(:each) do
          @url = SPEC_DOMAIN
          @page = Page.new(URI(@url))
        end

        it "should implement [] and []=" do
          @store.should respond_to(:[])
          @store.should respond_to(:[]=)

          @store[@url] = @page
          @store[@url].url.should == URI(@url)
        end

        it "should implement has_key?" do
          @store.should respond_to(:has_key?)

          @store[@url] = @page
          @store.has_key?(@url).should == true

          @store.has_key?('missing').should == false
        end

        it "should implement delete" do
          @store.should respond_to(:delete)

          @store[@url] = @page
          @store.delete(@url).url.should == @page.url
          @store.has_key?(@url).should == false
        end

        it "should implement keys" do
          @store.should respond_to(:keys)

          urls = [SPEC_DOMAIN, SPEC_DOMAIN + 'test', SPEC_DOMAIN + 'another']
          pages = urls.map { |url| Page.new(URI(url)) }
          urls.zip(pages).each { |arr| @store[arr[0]] = arr[1] }

          (@store.keys - urls).should == []
        end

        it "should implement each" do
          @store.should respond_to(:each)

          urls = [SPEC_DOMAIN, SPEC_DOMAIN + 'test', SPEC_DOMAIN + 'another']
          pages = urls.map { |url| Page.new(URI(url)) }
          urls.zip(pages).each { |arr| @store[arr[0]] = arr[1] }

          result = {}
          @store.each { |k, v| result[k] = v }
          (result.keys - urls).should == []
          (result.values.map { |page| page.url.to_s } - urls).should == []
        end

        it "should implement merge!, and return self" do
          @store.should respond_to(:merge!)

          hash = {SPEC_DOMAIN => Page.new(URI(SPEC_DOMAIN)),
                  SPEC_DOMAIN + 'test' => Page.new(URI(SPEC_DOMAIN + 'test'))}
          merged = @store.merge! hash
          hash.each { |key, value| @store[key].url.to_s.should == key }

          merged.should === @store
        end

        it "should correctly deserialize nil redirect_to when loading" do
          @page.redirect_to.should be_nil
          @store[@url] = @page
          @store[@url].redirect_to.should be_nil
        end
      end

      describe PStore do
        it_should_behave_like "storage engine"

        before(:each) do
          @test_file = 'test.pstore'
          File.delete @test_file rescue nil
          @store = Anemone::Storage.PStore(@test_file)
        end

        after(:all) do
          File.delete @test_file rescue nil
        end
      end

      describe MySQL do
        it_should_behave_like "storage engine"

        # Exercise the MySQL adapter itself (default connection options);
        # there is no on-disk test file to create or remove for this backend.
        before(:each) do
          @store = Anemone::Storage.MySQL
        end

        after(:each) do
          @store.close
        end

      end

    end
  end
end