support other charset #77

Open · wants to merge 9 commits into base: next
38 changes: 31 additions & 7 deletions lib/anemone/core.rb
@@ -55,7 +55,11 @@ class Core
# proxy server port number
:proxy_port => false,
# HTTP read timeout in seconds
:read_timeout => nil
:read_timeout => nil,
# limit the size of the crawled pages queue
:pages_queue_limit => 1000,
# limit the number of unique links allowed per crawl (TODO: move links queue to external storage)
:links_limit => 2000
}

# Create setter methods for all options to be called from the crawl block
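For context, a quick sketch of how the two new limits would be passed to a crawl, like any other option (the URL and values are illustrative, not part of this PR):

require 'anemone'

# Cap the in-memory queues; the values here are arbitrary examples.
Anemone.crawl("http://example.com/",
              :pages_queue_limit => 500,
              :links_limit => 1000) do |anemone|
  anemone.on_every_page { |page| puts page.url }
end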
@@ -79,6 +83,7 @@ def initialize(urls, opts = {})
@skip_link_patterns = []
@after_crawl_blocks = []
@opts = opts
@stop_crawl = false

yield self if block_given?
end
@@ -142,6 +147,17 @@ def focus_crawl(&block)
self
end

# Signals the crawler that it should stop the crawl before visiting the
# next page.
#
# This method is expected to be called within a page block; it signals
# the crawler to stop after the current page has been completely
# processed. All pages and links currently in the queue are discarded.
#
def stop_crawl
@stop_crawl = true
end
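A usage sketch for the new method (the page counter and the 100-page cutoff are illustrative assumptions, not part of this PR):

require 'anemone'

Anemone.crawl("http://example.com/") do |anemone|
  visited = 0
  anemone.on_every_page do |page|
    visited += 1
    anemone.stop_crawl if visited >= 100  # abort once we've seen enough
  end
end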

#
# Perform the crawl
#
@@ -152,7 +168,7 @@ def run
return if @urls.empty?

link_queue = Queue.new
page_queue = Queue.new
page_queue = SizedQueue.new(@opts[:pages_queue_limit])

@opts[:threads].times do
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
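The switch from Queue to SizedQueue is what enforces :pages_queue_limit: a SizedQueue blocks producers when full, so the tentacle threads pause instead of growing the queue without bound. A standalone sketch of that behavior:

q = SizedQueue.new(2)          # page_queue now uses this class
q << :page1
q << :page2
consumer = Thread.new { q.deq }
q << :page3                    # blocks until the consumer dequeues an item
consumer.join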
@@ -163,22 +179,30 @@
loop do
page = page_queue.deq
@pages.touch_key page.url
puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
puts "#{page.url} Queue: #{link_queue.size} PageQueue #{page_queue.size}" if @opts[:verbose]
do_page_blocks page
page.discard_doc! if @opts[:discard_page_bodies]

links = links_to_follow page
links.each do |link|
link_queue << [link, page.url.dup, page.depth + 1]
if link_queue.size < @opts[:links_limit] and !@stop_crawl
links = links_to_follow page
puts "links: #{links.count}" if @opts[:verbose]
links.each do |link|
link_queue << [link, page.url.dup, page.depth + 1]
end
@pages.touch_keys links
end
@pages.touch_keys links

@pages[page.url] = page

if @stop_crawl
link_queue.clear
end

# if we are done with the crawl, tell the threads to end
if link_queue.empty? and page_queue.empty?
until link_queue.num_waiting == @tentacles.size
Thread.pass
break unless page_queue.empty? # page queue could be filled again by waiting threads
end
if page_queue.empty?
@tentacles.size.times { link_queue << :END }
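For readers unfamiliar with the shutdown handshake at the end of run: the main loop pushes one :END sentinel per tentacle so each worker thread can exit cleanly. A generic sketch of the pattern (not Anemone code):

queue   = Queue.new
workers = 2.times.map do
  Thread.new { loop { break if queue.deq == :END } }
end
workers.size.times { queue << :END }   # one sentinel per worker
workers.each(&:join)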
10 changes: 9 additions & 1 deletion lib/anemone/page.rb
@@ -74,7 +74,10 @@ def links
#
def doc
return @doc if @doc
@doc = Nokogiri::HTML(@body) if @body && html? rescue nil
if @body && html?
@body = @body.force_encoding(charset).encode('utf-8') unless (charset == 'utf-8' || charset.nil?) rescue nil
@doc = Nokogiri::HTML(@body)
end
end

#
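The re-encoding step added to doc is easy to sanity-check in isolation; a minimal sketch using illustrative Latin-1 bytes:

body = "caf\xE9"                           # "café" as ISO-8859-1 bytes
body = body.force_encoding('iso-8859-1')   # relabel the raw bytes
puts body.encode('utf-8')                  # => "café"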
@@ -107,6 +110,11 @@ def content_type
headers['content-type'].first
end

def charset
matcher = content_type.match(/charset=[\"]?([a-zA-Z\-\d]*)[\"]?/)
matcher[1].downcase if matcher
end

#
# Returns +true+ if the page is a HTML document, returns +false+
# otherwise.
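And the new charset helper against a couple of representative Content-Type values (the header strings are illustrative):

pattern = /charset=[\"]?([a-zA-Z\-\d]*)[\"]?/
'text/html; charset=UTF-8'.match(pattern)[1].downcase          # => "utf-8"
'text/html; charset="ISO-8859-1"'.match(pattern)[1].downcase   # => "iso-8859-1"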
5 changes: 5 additions & 0 deletions lib/anemone/storage.rb
@@ -30,6 +30,11 @@ def self.MongoDB(mongo_db = nil, collection_name = 'pages')
self::MongoDB.new(mongo_db, collection_name)
end

def self.Mongoid(model_name = 'anemone_page')
require 'anemone/storage/mongoid'
self::Mongoid.new(model_name)
end

def self.Redis(opts = {})
require 'anemone/storage/redis'
self::Redis.new(opts)
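Wiring the new backend into a crawl presumably follows the same pattern as the existing stores (the model name below is the PR's default):

require 'anemone'

Anemone.crawl("http://example.com/") do |anemone|
  # Same style as Storage.MongoDB and Storage.Redis above.
  anemone.storage = Anemone::Storage.Mongoid('anemone_page')
end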
80 changes: 80 additions & 0 deletions lib/anemone/storage/mongoid.rb
@@ -0,0 +1,80 @@
begin
require 'mongoid'
rescue LoadError
puts "You need the mongoid gem to use Anemone::Storage::Mongoid"
exit
end

module Anemone
module Storage
class Mongoid

BINARY_FIELDS = %w(body headers data)

def initialize(model_name)
@model = model_name.is_a?(String) ? model_name.classify.constantize : model_name
@model.destroy_all
@model.create_indexes #'url'
end

def [](url)
if value = @model.where(:url => url.to_s).first
load_page(value)
end
end

def []=(url, page)
hash = page.to_hash
BINARY_FIELDS.each do |field|
hash[field] = Moped::BSON::Binary.new(:generic, hash[field]) unless hash[field].nil?
end
@model.find_or_create_by(:url => page.url.to_s).update_attributes(hash)
end

def delete(url)
page = self[url]
@model.destroy(:url => url.to_s)
page
end

def each
@model.each do |doc|
page = load_page(doc)
yield page.url.to_s, page
end
end

def merge!(hash)
hash.each { |key, value| self[key] = value }
self
end

def size
@model.count
end

def keys
keys = []
self.each { |k, v| keys << k.to_s }
keys
end

def has_key?(url)
!!@model.where(:url => url.to_s).first
end

def close
end

private

def load_page(doc)
BINARY_FIELDS.each do |field|
doc[field] = doc[field].to_s
end
Page.from_hash(doc)
end

end
end
end
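Since the store constantizes the model name, callers must supply a Mongoid model themselves. A sketch of the minimal shape it appears to expect; the field list and index options are assumptions inferred from Page#to_hash, BINARY_FIELDS, and the create_indexes call, not part of this PR:

class AnemonePage
  include Mongoid::Document

  # Only :url is queried directly by the store. The remaining attributes
  # written by Page#to_hash can be declared explicitly or handled via
  # dynamic fields, depending on your Mongoid version.
  field :url, :type => String
  field :body
  field :headers
  field :data

  index({ :url => 1 }, { :unique => true })
end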