diff --git a/lib/anemone/core.rb b/lib/anemone/core.rb
index d1629a49..8178bb80 100644
--- a/lib/anemone/core.rb
+++ b/lib/anemone/core.rb
@@ -55,7 +55,11 @@ class Core
     # proxy server port number
     :proxy_port => false,
     # HTTP read timeout in seconds
-    :read_timeout => nil
+    :read_timeout => nil,
+    # limit the size of the crawled-pages queue
+    :pages_queue_limit => 1000,
+    # limit the number of unique links followed per crawl (TODO: move the links queue to external storage)
+    :links_limit => 2000
   }

   # Create setter methods for all options to be called from the crawl block
@@ -79,6 +83,7 @@ def initialize(urls, opts = {})
    @skip_link_patterns = []
    @after_crawl_blocks = []
    @opts = opts
+    @stop_crawl = false

    yield self if block_given?
  end
@@ -142,6 +147,17 @@ def focus_crawl(&block)
    self
  end

+  # Signals the crawler that it should stop the crawl before visiting the
+  # next page.
+  #
+  # This method is expected to be called from within a page block; it tells
+  # the crawler to stop once the current page has been completely
+  # processed. All pages and links currently on the queue are discarded.
+  #
+  def stop_crawl
+    @stop_crawl = true
+  end
+
  #
  # Perform the crawl
  #
@@ -152,7 +168,7 @@ def run
    return if @urls.empty?

    link_queue = Queue.new
-    page_queue = Queue.new
+    page_queue = SizedQueue.new(@opts[:pages_queue_limit])

    @opts[:threads].times do
      @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
@@ -163,22 +179,30 @@ def run
    loop do
      page = page_queue.deq
      @pages.touch_key page.url
-      puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+      puts "#{page.url} Queue: #{link_queue.size} PageQueue: #{page_queue.size}" if @opts[:verbose]
      do_page_blocks page
      page.discard_doc! if @opts[:discard_page_bodies]

-      links = links_to_follow page
-      links.each do |link|
-        link_queue << [link, page.url.dup, page.depth + 1]
+      if link_queue.size < @opts[:links_limit] and !@stop_crawl
+        links = links_to_follow page
+        puts "links: #{links.count}" if @opts[:verbose]
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
+        end
+        @pages.touch_keys links
      end
-      @pages.touch_keys links

      @pages[page.url] = page

+      if @stop_crawl
+        link_queue.clear
+      end
+
      # if we are done with the crawl, tell the threads to end
      if link_queue.empty? and page_queue.empty?
        until link_queue.num_waiting == @tentacles.size
          Thread.pass
+          break unless page_queue.empty? # the page queue may be refilled by threads that are still working
        end
        if page_queue.empty?
          @tentacles.size.times { link_queue << :END }
diff --git a/lib/anemone/page.rb b/lib/anemone/page.rb
index b157ad63..8a2f3f51 100644
--- a/lib/anemone/page.rb
+++ b/lib/anemone/page.rb
@@ -74,7 +74,10 @@ def links
  #
  def doc
    return @doc if @doc
-    @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
+    if @body && html?
+      @body = @body.force_encoding(charset).encode('utf-8') unless (charset == 'utf-8' || charset.nil?) rescue nil
+      @doc = Nokogiri::HTML(@body)
+    end
  end

  #
@@ -107,6 +110,14 @@ def content_type
    headers['content-type'].first
  end

+  #
+  # The charset declared in the Content-Type header, or +nil+ if none
+  #
+  def charset
+    matcher = content_type.match(/charset=[\"]?([a-zA-Z\-\d]*)[\"]?/)
+    matcher[1].downcase if matcher
+  end
+
  #
  # Returns +true+ if the page is a HTML document, returns +false+
  # otherwise.
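A minimal usage sketch of the new knobs (the URL and limit values below are illustrative, not recommendations): :pages_queue_limit applies back-pressure via SizedQueue, :links_limit stops new links from being enqueued once the link queue reaches the limit, and stop_crawl lets a page block end the crawl early.

require 'anemone'

pages_processed = 0

Anemone.crawl("http://example.com/",
              :pages_queue_limit => 500,   # cap on the in-memory page queue
              :links_limit => 1000,        # stop following new links past this point
              :verbose => true) do |anemone|
  anemone.on_every_page do |page|
    pages_processed += 1
    # Ask the crawler to stop after this page is fully processed;
    # anything still queued is discarded.
    anemone.stop_crawl if pages_processed >= 100
  end
end
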
diff --git a/lib/anemone/storage.rb b/lib/anemone/storage.rb
index 9c4ab096..4b26f949 100644
--- a/lib/anemone/storage.rb
+++ b/lib/anemone/storage.rb
@@ -30,6 +30,11 @@ def self.MongoDB(mongo_db = nil, collection_name = 'pages')
    self::MongoDB.new(mongo_db, collection_name)
  end

+  def self.Mongoid(model_name = 'anemone_page')
+    require 'anemone/storage/mongoid'
+    self::Mongoid.new(model_name)
+  end
+
  def self.Redis(opts = {})
    require 'anemone/storage/redis'
    self::Redis.new(opts)
diff --git a/lib/anemone/storage/mongoid.rb b/lib/anemone/storage/mongoid.rb
new file mode 100644
index 00000000..69f00cd7
--- /dev/null
+++ b/lib/anemone/storage/mongoid.rb
@@ -0,0 +1,80 @@
+begin
+  require 'mongoid'
+rescue LoadError
+  puts "You need the mongoid gem to use Anemone::Storage::Mongoid"
+  exit
+end
+
+module Anemone
+  module Storage
+    class Mongoid
+
+      BINARY_FIELDS = %w(body headers data)
+
+      def initialize(model_name)
+        @model = model_name.is_a?(String) ? model_name.classify.constantize : model_name
+        @model.destroy_all
+        @model.create_indexes # expects a 'url' index declared on the model
+      end
+
+      def [](url)
+        if value = @model.where(:url => url.to_s).first
+          load_page(value)
+        end
+      end
+
+      def []=(url, page)
+        hash = page.to_hash
+        BINARY_FIELDS.each do |field|
+          hash[field] = Moped::BSON::Binary.new(:generic, hash[field]) unless hash[field].nil?
+        end
+        @model.find_or_create_by(:url => page.url.to_s).update_attributes(hash)
+      end
+
+      def delete(url)
+        page = self[url]
+        @model.destroy_all(:url => url.to_s)
+        page
+      end
+
+      def each
+        @model.each do |doc|
+          page = load_page(doc)
+          yield page.url.to_s, page
+        end
+      end
+
+      def merge!(hash)
+        hash.each { |key, value| self[key] = value }
+        self
+      end
+
+      def size
+        @model.count
+      end
+
+      def keys
+        keys = []
+        self.each { |k, v| keys << k.to_s }
+        keys
+      end
+
+      def has_key?(url)
+        !!@model.where(:url => url.to_s).first
+      end
+
+      def close
+      end
+
+      private
+
+      # Convert BSON binary fields back to strings before rebuilding the Page
+      def load_page(doc)
+        BINARY_FIELDS.each do |field|
+          doc[field] = doc[field].to_s
+        end
+        Page.from_hash(doc)
+      end
+
+    end
+  end
+end
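A hedged sketch of wiring the new backend into a crawl. It assumes Mongoid 3 (implied by the Moped::BSON reference above); the AnemonePage model, config path, and URL are hypothetical examples, not part of the patch. The adapter resolves the string 'anemone_page' to the class via classify/constantize, and its create_indexes call relies on the index declared on the model.

require 'anemone'
require 'mongoid'

Mongoid.load!("config/mongoid.yml", :development)  # hypothetical config path

# Hypothetical backing model. Mongoid 3 allows the remaining page
# attributes (body, headers, ...) as dynamic fields by default.
class AnemonePage
  include Mongoid::Document
  field :url, :type => String
  index({ :url => 1 }, { :unique => true })
end

Anemone.crawl("http://example.com/") do |anemone|
  anemone.storage = Anemone::Storage.Mongoid('anemone_page')
end
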