Sitemap XML support #69

Open · wants to merge 2 commits into master
7 changes: 7 additions & 0 deletions lib/spidr/agent.rb
@@ -4,6 +4,7 @@
require 'spidr/agent/events'
require 'spidr/agent/actions'
require 'spidr/agent/robots'
require 'spidr/agent/sitemap'
require 'spidr/page'
require 'spidr/session_cache'
require 'spidr/cookie_jar'
@@ -222,6 +223,10 @@ def initialize(options={})
initialize_robots
end

if options.fetch(:sitemap,false)
initialize_sitemap
end

yield self if block_given?
end

@@ -351,6 +356,8 @@ def clear
# A page which has been visited.
#
def start_at(url,&block)
sitemap_urls(url).each { |u| enqueue(u) }

enqueue(url)
return run(&block)
end
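
A usage sketch of the new option; this assumes the existing Spidr.site convenience wrapper forwards the :sitemap option unchanged to Agent#initialize:

  Spidr.site('http://example.com/', sitemap: true) do |agent|
    agent.every_page do |page|
      puts page.url
    end
  end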
69 changes: 69 additions & 0 deletions lib/spidr/agent/sitemap.rb
@@ -0,0 +1,69 @@
require 'set'

module Spidr
class Agent
# Common locations for Sitemap(s)
COMMON_SITEMAP_LOCATIONS = %w[
sitemap.xml
sitemap.xml.gz
sitemap.gz
sitemap_index.xml
sitemap-index.xml
sitemap_index.xml.gz
sitemap-index.xml.gz
].freeze

#
# Initializes the sitemap fetcher.
#
def initialize_sitemap
@sitemap = true
end

#
# Returns the URLs found as per the sitemap.xml spec.
#
# @return [Array<URI::HTTP>, Array<URI::HTTPS>]
# The URLs found.
#
# @see https://www.sitemaps.org/protocol.html
Owner: Add extra #.

def sitemap_urls(url)
return [] unless @sitemap
base_url = to_base_url(url)

if @robots
if urls = @robots.other_values(base_url)['Sitemap']
return urls.flat_map { |u| get_sitemap_urls(url: u) }
end
end
Owner: This is a clever way of populating the queue using the Sitemap listed in the robots.txt file. Although I feel like we should not request any URL before run has been called. A way around that would be to add every_robots_txt and every_sitemap_xml callback hooks, and use those to automatically parse and enqueue URLs when those files are encountered.

Something like:

  agent.every_robots_txt do |robots| # RobotsTXT class
    # check robots for a `Sitemap` entry
  end
  
  agent.every_sitemap_xml do |sitemap| # SitemapXML class
    sitemap.urls.each { |url| agent.enqueue(url) }
  end

Owner: Hmm, every_robots_txt might not be feasible, since I forgot that the Robots library automatically fetches /robots.txt files and caches them when you query it. I could possibly add an every_host callback to detect when the Agent visits a new hostname, and then we could use that to eagerly request /robots.txt and /sitemap.xml. This would also allow the sitemap detection logic to fire on every new host/domain we spider, not just the first one. Thoughts?

I could add an every_host callback hook in the 0.7.0 branch, if you think it's a good idea.
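
A hypothetical sketch of how the proposed hook could be used; every_host does not exist yet, so the callback name and semantics are only the reviewer's suggestion:

  agent.every_host do |host| # fires once per newly visited hostname
    # Eagerly request the well-known metadata files for the new host.
    agent.enqueue(URI::HTTP.build(host: host, path: '/robots.txt'))
    agent.enqueue(URI::HTTP.build(host: host, path: '/sitemap.xml'))
  end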


COMMON_SITEMAP_LOCATIONS.each do |path|
if (page = get_page("#{base_url}/#{path}")).code == 200
return get_sitemap_urls(page: page)
end
end

[]
end

private

def get_sitemap_urls(url: nil, page: nil)
page = get_page(url) if page.nil?
return [] unless page
Owner: This could be rewritten as a nested if.

if url && !page
  unless (page = get_page(url))
    return []
  end
end


if page.sitemap_index?
page.each_sitemap_index_url.flat_map { |u| get_sitemap_urls(url: u) }
else
page.sitemap_urls
end
end

def to_base_url(url)
uri = url
uri = URI.parse(url) unless url.is_a?(URI)
Owner: I think it's safe to use URI(url) to handle both URIs and Strings.


"#{uri.scheme}://#{uri.host}"
end
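
A sketch of the simplification suggested in the review comment above; Kernel's URI() returns a URI argument unchanged and parses a String, so it covers both cases:

  def to_base_url(url)
    uri = URI(url)

    "#{uri.scheme}://#{uri.host}"
  end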
end
end
3 changes: 2 additions & 1 deletion lib/spidr/page.rb
@@ -142,11 +142,12 @@ def method_missing(name,*arguments,&block)

return super(name,*arguments,&block)
end

end
end

require 'spidr/page/status_codes'
require 'spidr/page/content_types'
require 'spidr/page/cookies'
require 'spidr/page/html'
require 'spidr/page/sitemap'
10 changes: 10 additions & 0 deletions lib/spidr/page/content_types.rb
@@ -221,5 +221,15 @@ def pdf?
def zip?
is_content_type?('application/zip')
end

#
# Determines if the page is a Gzip archive.
#
# @return [Boolean]
# Specifies whether the page is a Gzip archive.
#
def gzip?
is_content_type?('application/gzip')
end
end
end
194 changes: 194 additions & 0 deletions lib/spidr/page/sitemap.rb
@@ -0,0 +1,194 @@
require 'nokogiri'
require 'zlib'

module Spidr
class Page
Owner: I feel like there should be a Sitemap class that inherits from Page. Only the sitemap.xml file should have sitemap methods. This would also allow for these methods to not contain sitemap in them, which seems kind of redundant if we're already parsing a sitemap.xml file.
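
A minimal sketch of that suggested refactoring; the Spidr::Sitemap class and its shortened method names are hypothetical, not part of this PR:

  module Spidr
    class Sitemap < Page
      include Enumerable

      # The class name already says "sitemap", so the enumerator can
      # simply be called #each_url.
      def each_url
        return enum_for(__method__) unless block_given?

        sitemap_doc.css('url loc').each do |loc|
          if (url = to_absolute(loc.text))
            yield url
          end
        end
      end

      alias each each_url

      def urls
        each_url.to_a
      end
    end
  end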

include Enumerable

#
# Enumerates over the links in the sitemap page.
#
# @yield [link]
# If a block is given, it will be passed every link in the
# sitemap page.
#
# @yieldparam [String] link
# A URL from the sitemap page.
#
# @return [Enumerator]
# If no block is given, an enumerator object will be returned.
Owner: Add extra #.

def each_sitemap_link
return enum_for(__method__) unless block_given?

each_extracted_sitemap_links('url') { |url| yield(url) }
Owner: Could also be simplified by passing in a &block argument.

end
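
A sketch of that simplification, assuming each_extracted_sitemap_links yields each link to the block it receives:

  def each_sitemap_link(&block)
    return enum_for(__method__) unless block_given?

    each_extracted_sitemap_links('url', &block)
  end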

#
# Return all links defined in Sitemap.
#
# @return [Array<String>]
# of links defined in Sitemap.
Owner: Sentence fragment.

def sitemap_links
each_sitemap_link.to_a
end

#
# Enumerates over the Sitemap index links in the sitemap page.
#
# @yield [link]
# If a block is given, it will be passed every link in the
# sitemap page.
#
# @yieldparam [String] link
# A URL from the sitemap page.
#
# @return [Enumerator]
# If no block is given, an enumerator object will be returned.
Owner: Add extra #.

def each_sitemap_index_link
return enum_for(__method__) unless block_given?

each_extracted_sitemap_links('sitemap') { |url| yield(url) }
Owner: Probably can be simplified by passing in a &block argument.

end

#
# Return all Sitemap index links defined in sitemap.
#
# @return [Array<String>]
# of links defined in Sitemap.
Owner: Sentence fragment.

def sitemap_index_links
each_sitemap_index_link.to_a
end

#
# Enumerates over the URLs in the sitemap page.
#
# @yield [url]
# If a block is given, it will be passed every URL in the
# sitemap page.
#
# @yieldparam [URI::HTTP, URI::HTTPS] url
# A URL from the sitemap page.
#
# @return [Enumerator]
# If no block is given, an enumerator object will be returned.
Owner: Add extra #.

def each_sitemap_url
return enum_for(__method__) unless block_given?

each_sitemap_link do |link|
if (url = to_absolute(link))
yield url
end
end
end

#
# Return all URLs defined in Sitemap.
#
# @return [Array<URI::HTTP>, Array<URI::HTTPS>]
# of URLs defined in Sitemap.
Owner: Sentence fragment.

def sitemap_urls
each_sitemap_url.to_a
end

#
# Enumerates over the sitemap URLs in the sitemap page.
#
# @yield [url]
# If a block is given, it will be passed every sitemap URL in the
# sitemap page.
#
# @yieldparam [URI::HTTP, URI::HTTPS] url
# A sitemap URL from the sitemap page.
#
# @return [Enumerator]
# If no block is given, an enumerator object will be returned.
Owner: Add extra #.

def each_sitemap_index_url
return enum_for(__method__) unless block_given?

each_sitemap_index_link do |link|
if (url = to_absolute(link))
yield url
end
end
end

#
# Return all sitemap index URLs defined in Sitemap.
#
# @return [Array<URI::HTTP>, Array<URI::HTTPS>]
# Sitemap index URLs defined in Sitemap.
Owner: Add extra #.

def sitemap_index_urls
each_sitemap_index_url.to_a
end

#
# Returns true if Sitemap is a Sitemap index.
#
# @return [Boolean]
Owner: Add extra #.

def sitemap_index?
sitemap_root_name == 'sitemapindex'
end

#
# Returns true if Sitemap is a regular list of URLs.
#
# @return [Boolean]
Owner: Add extra #.

def sitemap_urlset?
sitemap_root_name == 'urlset'
end

#
# Returns the document for the sitemap; if the content type is gzip,
# it will be uncompressed first.
#
# @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
# The document that represents sitemap XML pages.
# Returns `nil` if the page is neither XML nor gzipped XML, or if
# the page could not be parsed properly.
#
# @see #doc
#
def sitemap_doc
return doc if doc && !gzip?

begin
@sitemap_doc ||= Nokogiri::XML::Document.parse(unzipped_body, @url.to_s, content_charset)
rescue
end
end

private

def sitemap_root_name
return unless doc.root

doc.root.name
end

def each_extracted_sitemap_links(node_name)
if plain_text?
return unzipped_body.each_line { |url| yield(url.strip) }
end

return unless sitemap_doc

sitemap_doc.css("#{node_name} loc").each do |element|
Owner: Can we use XPath here since the file is XML? I think the XPath would be //#{node_name}/loc?

yield(element.text)
end
end
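
A sketch of the XPath variant raised in the review comment above. One caveat (an assumption worth testing against the spec fixtures): sitemap files declare a default XML namespace, which Nokogiri registers under the xmlns prefix for XPath queries, so the selector likely needs that prefix:

  sitemap_doc.xpath("//xmlns:#{node_name}/xmlns:loc").each do |element|
    yield(element.text)
  end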

def unzipped_body
return body unless gzip?

io = StringIO.new(body)
gz = Zlib::GzipReader.new(io)
body = gz.read
rescue Zlib::Error
''
ensure
gz.close if gz

body
end
end
end
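
A usage sketch of the Page API this file adds, assuming page is a Spidr::Page fetched from a sitemap URL:

  if page.sitemap_index?
    # A sitemap index points at further sitemap files.
    page.each_sitemap_index_url { |url| puts "nested sitemap: #{url}" }
  else
    # A urlset sitemap lists the site's page URLs directly.
    page.sitemap_urls.each { |url| puts "page URL: #{url}" }
  end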
58 changes: 58 additions & 0 deletions spec/agent/sitemap_spec.rb
@@ -0,0 +1,58 @@
require 'spec_helper'
require 'example_app'

require 'spidr/agent'

describe Agent do
describe "sitemap" do
context "from common sitemap index path" do
include_context "example App"

subject { described_class.new(host: host, sitemap: true) }

app do
before do
content_type 'application/xml'
end

get '/sitemap-index.xml' do
<<-SITEMAP_XML
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>http://example.com/my-sitemap.xml</loc>
</sitemap>
</sitemapindex>
SITEMAP_XML
end

get '/my-sitemap.xml' do
<<-SITEMAP_XML
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://example.com/</loc>
</url>
<url>
<loc>http://example.com/some-path</loc>
</url>
</urlset>
SITEMAP_XML
end
end

before do
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
Owner: This is redundant code. Already defined in spec/example_app.rb:22.

end

it 'should fetch all URLs in sitemap' do
urls = subject.sitemap_urls('http://example.com')
expected = [
URI('http://example.com/'),
URI('http://example.com/some-path')
]
expect(urls).to eq(expected)
end
end
end
end
4 changes: 4 additions & 0 deletions spec/page/content_types_spec.rb
@@ -147,4 +147,8 @@
describe "#zip?" do
include_examples "Content-Type method", :zip?, 'application/zip'
end

describe "#gzip?" do
include_examples "Content-Type method", :gzip?, 'application/gzip'
end
end