support other charset #77

Open · wants to merge 9 commits into base: next
38 changes: 31 additions & 7 deletions lib/anemone/core.rb
@@ -55,7 +55,11 @@ class Core
# proxy server port number
:proxy_port => false,
# HTTP read timeout in seconds
:read_timeout => nil
:read_timeout => nil,
# limit the size of the crawled pages queue
:pages_queue_limit => 1000,
# limit the number of unique links allowed per crawl (TODO: move links queue to external storage)
:links_limit => 2000
}

# Create setter methods for all options to be called from the crawl block
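For context, a quick sketch of how the two new limits would be passed to a crawl, like any other option (the URL and values are illustrative, not part of this PR):

require 'anemone'

# Cap the in-memory queues; the values here are arbitrary examples.
Anemone.crawl("http://example.com/",
              :pages_queue_limit => 500,
              :links_limit => 1000) do |anemone|
  anemone.on_every_page { |page| puts page.url }
end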
@@ -79,6 +83,7 @@ def initialize(urls, opts = {})
@skip_link_patterns = []
@after_crawl_blocks = []
@opts = opts
@stop_crawl = false

yield self if block_given?
end
@@ -142,6 +147,17 @@ def focus_crawl(&block)
self
end

# Signals the crawler that it should stop the crawl before visiting the
# next page.
#
# This method is expected to be called within a page block; it signals
# the crawler to stop after the current page has been completely
# processed. All pages and links currently in the queue are discarded.
#
def stop_crawl
@stop_crawl = true
end
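A usage sketch for the new method (the page counter and the 100-page cutoff are illustrative assumptions, not part of this PR):

require 'anemone'

Anemone.crawl("http://example.com/") do |anemone|
  visited = 0
  anemone.on_every_page do |page|
    visited += 1
    anemone.stop_crawl if visited >= 100  # abort once we've seen enough
  end
end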

#
# Perform the crawl
#
@@ -152,7 +168,7 @@ def run
return if @urls.empty?

link_queue = Queue.new
page_queue = Queue.new
page_queue = SizedQueue.new(@opts[:pages_queue_limit])

@opts[:threads].times do
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
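The switch from Queue to SizedQueue is what enforces :pages_queue_limit: a SizedQueue blocks producers when full, so the tentacle threads pause instead of growing the queue without bound. A standalone sketch of that behavior:

q = SizedQueue.new(2)          # page_queue now uses this class
q << :page1
q << :page2
consumer = Thread.new { q.deq }
q << :page3                    # blocks until the consumer dequeues an item
consumer.join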
@@ -163,22 +179,30 @@
loop do
page = page_queue.deq
@pages.touch_key page.url
puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
puts "#{page.url} Queue: #{link_queue.size} PageQueue #{page_queue.size}" if @opts[:verbose]
do_page_blocks page
page.discard_doc! if @opts[:discard_page_bodies]

links = links_to_follow page
links.each do |link|
link_queue << [link, page.url.dup, page.depth + 1]
if link_queue.size < @opts[:links_limit] and !@stop_crawl
links = links_to_follow page
puts "links: #{links.count}" if @opts[:verbose]
links.each do |link|
link_queue << [link, page.url.dup, page.depth + 1]
end
@pages.touch_keys links
end
@pages.touch_keys links

@pages[page.url] = page

if @stop_crawl
link_queue.clear
end

# if we are done with the crawl, tell the threads to end
if link_queue.empty? and page_queue.empty?
until link_queue.num_waiting == @tentacles.size
Thread.pass
break unless page_queue.empty? # page queue could be filled again by waiting threads
end
if page_queue.empty?
@tentacles.size.times { link_queue << :END }
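For readers unfamiliar with the shutdown handshake at the end of run: the main loop pushes one :END sentinel per tentacle so each worker thread can exit cleanly. A generic sketch of the pattern (not Anemone code):

queue   = Queue.new
workers = 2.times.map do
  Thread.new { loop { break if queue.deq == :END } }
end
workers.size.times { queue << :END }   # one sentinel per worker
workers.each(&:join)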
10 changes: 9 additions & 1 deletion lib/anemone/page.rb
@@ -74,7 +74,10 @@ def links
#
def doc
return @doc if @doc
@doc = Nokogiri::HTML(@body) if @body && html? rescue nil
if @body && html?
@body = @body.force_encoding(charset).encode('utf-8') unless (charset == 'utf-8' || charset.nil?) rescue nil
@doc = Nokogiri::HTML(@body)
end
end

#
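The re-encoding step added to doc is easy to sanity-check in isolation; a minimal sketch using illustrative Latin-1 bytes:

body = "caf\xE9"                           # "café" as ISO-8859-1 bytes
body = body.force_encoding('iso-8859-1')   # relabel the raw bytes
puts body.encode('utf-8')                  # => "café"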
@@ -107,6 +110,11 @@ def content_type
headers['content-type'].first
end

def charset
matcher = content_type.match(/charset=[\"]?([a-zA-Z\-\d]*)[\"]?/)
matcher[1].downcase if matcher
end

#
# Returns +true+ if the page is a HTML document, returns +false+
# otherwise.
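And the new charset helper against a couple of representative Content-Type values (the header strings are illustrative):

pattern = /charset=[\"]?([a-zA-Z\-\d]*)[\"]?/
'text/html; charset=UTF-8'.match(pattern)[1].downcase          # => "utf-8"
'text/html; charset="ISO-8859-1"'.match(pattern)[1].downcase   # => "iso-8859-1"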
5 changes: 5 additions & 0 deletions lib/anemone/storage.rb
@@ -30,6 +30,11 @@ def self.MongoDB(mongo_db = nil, collection_name = 'pages')
self::MongoDB.new(mongo_db, collection_name)
end

def self.Mongoid(model_name = 'anemone_page')
require 'anemone/storage/mongoid'
self::Mongoid.new(model_name)
end

def self.Redis(opts = {})
require 'anemone/storage/redis'
self::Redis.new(opts)
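Wiring the new backend into a crawl presumably follows the same pattern as the existing stores (the model name below is the PR's default):

require 'anemone'

Anemone.crawl("http://example.com/") do |anemone|
  # Same style as Storage.MongoDB and Storage.Redis above.
  anemone.storage = Anemone::Storage.Mongoid('anemone_page')
end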
80 changes: 80 additions & 0 deletions lib/anemone/storage/mongoid.rb
@@ -0,0 +1,80 @@
begin
require 'mongoid'
rescue LoadError
puts "You need the mongoid gem to use Anemone::Storage::Mongoid"
exit
end

module Anemone
module Storage
class Mongoid

BINARY_FIELDS = %w(body headers data)

def initialize(model_name)
@model = model_name.is_a?(String) ? model_name.classify.constantize : model_name
@model.destroy_all
@model.create_indexes #'url'
end

def [](url)
if value = @model.where(:url => url.to_s).first
load_page(value)
end
end

def []=(url, page)
hash = page.to_hash
BINARY_FIELDS.each do |field|
hash[field] = Moped::BSON::Binary.new(:generic, hash[field]) unless hash[field].nil?
end
@model.find_or_create_by(:url => page.url.to_s).update_attributes(hash)
end

def delete(url)
page = self[url]
@model.destroy(:url => url.to_s)
page
end

def each
@model.each do |doc|
page = load_page(doc)
yield page.url.to_s, page
end
end

def merge!(hash)
hash.each { |key, value| self[key] = value }
self
end

def size
@model.count
end

def keys
keys = []
self.each { |k, v| keys << k.to_s }
keys
end

def has_key?(url)
!!@model.where(:url => url.to_s).first
end

def close
end

private

def load_page(doc)
BINARY_FIELDS.each do |field|
doc[field] = doc[field].to_s
end
Page.from_hash(doc)
end

end
end
end
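Since the store constantizes the model name, callers must supply a Mongoid model themselves. A sketch of the minimal shape it appears to expect; the field list and index options are assumptions inferred from Page#to_hash, BINARY_FIELDS, and the create_indexes call, not part of this PR:

class AnemonePage
  include Mongoid::Document

  # Only :url is queried directly by the store. The remaining attributes
  # written by Page#to_hash can be declared explicitly or handled via
  # dynamic fields, depending on your Mongoid version.
  field :url, :type => String
  field :body
  field :headers
  field :data

  index({ :url => 1 }, { :unique => true })
end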