diff --git a/lib/spidr/agent.rb b/lib/spidr/agent.rb
index cb55b0bd..87e766dc 100644
--- a/lib/spidr/agent.rb
+++ b/lib/spidr/agent.rb
@@ -4,6 +4,7 @@
 require 'spidr/agent/events'
 require 'spidr/agent/actions'
 require 'spidr/agent/robots'
+require 'spidr/agent/sitemap'
 require 'spidr/page'
 require 'spidr/session_cache'
 require 'spidr/cookie_jar'
@@ -222,6 +223,10 @@ def initialize(options={})
         initialize_robots
       end
 
+      if options.fetch(:sitemap,false)
+        initialize_sitemap
+      end
+
       yield self if block_given?
     end
 
@@ -351,6 +356,8 @@ def clear
     #   A page which has been visited.
     #
     def start_at(url,&block)
+      sitemap_urls(url).each { |u| enqueue(u) }
+
       enqueue(url)
       return run(&block)
     end
diff --git a/lib/spidr/agent/sitemap.rb b/lib/spidr/agent/sitemap.rb
new file mode 100644
index 00000000..4bcbab74
--- /dev/null
+++ b/lib/spidr/agent/sitemap.rb
@@ -0,0 +1,105 @@
+require 'set'
+
+module Spidr
+  class Agent
+    # Common locations for Sitemap(s)
+    COMMON_SITEMAP_LOCATIONS = %w[
+      sitemap.xml
+      sitemap.xml.gz
+      sitemap.gz
+      sitemap_index.xml
+      sitemap-index.xml
+      sitemap_index.xml.gz
+      sitemap-index.xml.gz
+    ].freeze
+
+    #
+    # Initializes the sitemap fetcher.
+    #
+    def initialize_sitemap
+      @sitemap = true
+    end
+
+    #
+    # Returns the URLs found as per the sitemap.xml spec.
+    #
+    # When @robots is set and reports `Sitemap` values for the site, those
+    # sitemaps are read. Otherwise the common sitemap locations are probed
+    # in order and the first one answering with a 200 is read.
+    #
+    # @param [URI::Generic, String] url
+    #   A URL of the site whose sitemap should be read.
+    #
+    # @return [Array<URI::HTTP>, Array<URI::HTTPS>]
+    #   The URLs found.
+    #
+    # @see https://www.sitemaps.org/protocol.html
+    #
+    def sitemap_urls(url)
+      return [] unless @sitemap
+
+      base_url = to_base_url(url)
+
+      if @robots
+        if (urls = @robots.other_values(base_url)['Sitemap'])
+          return urls.flat_map { |u| get_sitemap_urls(url: u) }
+        end
+      end
+
+      COMMON_SITEMAP_LOCATIONS.each do |path|
+        page = get_page("#{base_url}/#{path}")
+
+        # get_page may not return a page (#get_sitemap_urls guards against
+        # the same case), so a failed request must not raise here.
+        return get_sitemap_urls(page: page) if page && page.code == 200
+      end
+
+      []
+    end
+
+    private
+
+    #
+    # Collects the page URLs of a sitemap, following sitemap indexes
+    # recursively.
+    #
+    # @param [URI::Generic, String, nil] url
+    #   The sitemap URL to request when no page is given.
+    #
+    # @param [Page, nil] page
+    #   An already requested sitemap page.
+    #
+    # @return [Array<URI::HTTP>, Array<URI::HTTPS>]
+    #   The URLs found, empty if no page could be requested.
+    #
+    def get_sitemap_urls(url: nil, page: nil)
+      page ||= get_page(url)
+      return [] unless page
+
+      if page.sitemap_index?
+        page.each_sitemap_index_url.flat_map { |u| get_sitemap_urls(url: u) }
+      else
+        page.sitemap_urls
+      end
+    end
+
+    #
+    # Reduces a URL to its scheme, host and non-default port.
+    #
+    # @param [URI::Generic, String] url
+    #
+    # @return [String]
+    #   The base URL, without a trailing slash.
+    #
+    def to_base_url(url)
+      uri = url.is_a?(URI) ? url : URI.parse(url)
+      base = "#{uri.scheme}://#{uri.host}"
+
+      # Keep an explicit non-default port, otherwise the sitemap would be
+      # requested from a different server than the one being spidered.
+      return base if uri.port.nil? || uri.port == uri.default_port
+
+      "#{base}:#{uri.port}"
+    end
+  end
+end
diff --git a/lib/spidr/page.rb b/lib/spidr/page.rb
index 4275940a..88926436 100644
--- a/lib/spidr/page.rb
+++ b/lib/spidr/page.rb
@@ -150,3 +150,4 @@ def method_missing(name,*arguments,&block)
 require 'spidr/page/content_types'
 require 'spidr/page/cookies'
 require 'spidr/page/html'
+require 'spidr/page/sitemap'
diff --git a/lib/spidr/page/content_types.rb b/lib/spidr/page/content_types.rb
index 21a1ef54..929f3c3d 100644
--- a/lib/spidr/page/content_types.rb
+++ b/lib/spidr/page/content_types.rb
@@ -221,5 +221,17 @@ def pdf?
     def zip?
       is_content_type?('application/zip')
     end
+
+    #
+    # Determines if the page is a Gzip archive.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is a Gzip archive.
+    #
+    def gzip?
+      # Also accept the older `application/x-gzip` name for the same format.
+      is_content_type?('application/gzip') ||
+        is_content_type?('application/x-gzip')
+    end
   end
 end
diff --git a/lib/spidr/page/sitemap.rb b/lib/spidr/page/sitemap.rb
new file mode 100644
index 00000000..8a4bab3d
--- /dev/null
+++ b/lib/spidr/page/sitemap.rb
@@ -0,0 +1,237 @@
+require 'nokogiri'
+require 'stringio'
+require 'zlib'
+
+module Spidr
+  class Page
+    include Enumerable
+
+    #
+    # Enumerates over the links in the sitemap page.
+    #
+    # @yield [link]
+    #   If a block is given, it will be passed every link in the
+    #   sitemap page.
+    #
+    # @yieldparam [String] link
+    #   A URL from the sitemap page.
+    #
+    # @return [Enumerator]
+    #   If no block is given, an enumerator object will be returned.
+    #
+    def each_sitemap_link(&block)
+      return enum_for(__method__) unless block_given?
+
+      each_extracted_sitemap_links('url', &block)
+    end
+
+    #
+    # Return all links defined in Sitemap.
+    #
+    # @return [Array<String>]
+    #   of links defined in Sitemap.
+    #
+    def sitemap_links
+      each_sitemap_link.to_a
+    end
+
+    #
+    # Enumerates over the Sitemap index links in the sitemap page.
+    #
+    # @yield [link]
+    #   If a block is given, it will be passed every link in the
+    #   sitemap page.
+    #
+    # @yieldparam [String] link
+    #   A URL from the sitemap page.
+    #
+    # @return [Enumerator]
+    #   If no block is given, an enumerator object will be returned.
+    #
+    def each_sitemap_index_link(&block)
+      return enum_for(__method__) unless block_given?
+
+      each_extracted_sitemap_links('sitemap', &block)
+    end
+
+    #
+    # Return all Sitemap index links defined in sitemap.
+    #
+    # @return [Array<String>]
+    #   of links defined in Sitemap.
+    #
+    def sitemap_index_links
+      each_sitemap_index_link.to_a
+    end
+
+    #
+    # Enumerates over the URLs in the sitemap page.
+    #
+    # @yield [url]
+    #   If a block is given, it will be passed every URL in the
+    #   sitemap page.
+    #
+    # @yieldparam [URI::HTTP, URI::HTTPS] url
+    #   A URL from the sitemap page.
+    #
+    # @return [Enumerator]
+    #   If no block is given, an enumerator object will be returned.
+    #
+    def each_sitemap_url
+      return enum_for(__method__) unless block_given?
+
+      each_sitemap_link do |link|
+        if (url = to_absolute(link))
+          yield url
+        end
+      end
+    end
+
+    #
+    # Return all URLs defined in Sitemap.
+    #
+    # @return [Array<URI::HTTP>, Array<URI::HTTPS>]
+    #   of URLs defined in Sitemap.
+    #
+    def sitemap_urls
+      each_sitemap_url.to_a
+    end
+
+    #
+    # Enumerates over the sitemap URLs in the sitemap page.
+    #
+    # @yield [url]
+    #   If a block is given, it will be passed every sitemap URL in the
+    #   sitemap page.
+    #
+    # @yieldparam [URI::HTTP, URI::HTTPS] url
+    #   A sitemap URL from the sitemap page.
+    #
+    # @return [Enumerator]
+    #   If no block is given, an enumerator object will be returned.
+    #
+    def each_sitemap_index_url
+      return enum_for(__method__) unless block_given?
+
+      each_sitemap_index_link do |link|
+        if (url = to_absolute(link))
+          yield url
+        end
+      end
+    end
+
+    #
+    # Return all sitemap index URLs defined in Sitemap.
+    #
+    # @return [Array<URI::HTTP>, Array<URI::HTTPS>]
+    #   Sitemap index URLs defined in Sitemap.
+    #
+    def sitemap_index_urls
+      each_sitemap_index_url.to_a
+    end
+
+    #
+    # Returns true if Sitemap is a Sitemap index.
+    #
+    # @return [Boolean]
+    #
+    def sitemap_index?
+      sitemap_root_name == 'sitemapindex'
+    end
+
+    #
+    # Returns true if Sitemap is a regular list of URLs.
+    #
+    # @return [Boolean]
+    #
+    def sitemap_urlset?
+      sitemap_root_name == 'urlset'
+    end
+
+    #
+    # Returns the document for the sitemap, if the content type is gzip it
+    # will be uncompressed.
+    #
+    # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
+    #   The document that represents sitemap XML pages.
+    #   Returns `nil` if the page is neither XML, gzipped XML or if
+    #   the page could not be parsed properly.
+    #
+    # @see #doc
+    #
+    def sitemap_doc
+      return doc if doc && !gzip?
+
+      @sitemap_doc ||= begin
+        Nokogiri::XML::Document.parse(unzipped_body, @url.to_s, content_charset)
+      rescue StandardError
+        # Best effort: an unparsable sitemap is reported as `nil`.
+        nil
+      end
+    end
+
+    private
+
+    #
+    # Name of the sitemap document's root element.
+    #
+    # @return [String, nil]
+    #   `nil` if there is no sitemap document or it has no root element.
+    #
+    def sitemap_root_name
+      # Must be #sitemap_doc rather than #doc, otherwise gzipped sitemap
+      # indexes are never recognised.
+      document = sitemap_doc
+      return unless document && document.root
+
+      document.root.name
+    end
+
+    #
+    # Yields the text of every `loc` element nested in elements named
+    # `node_name`, or every non-blank line of a plain-text sitemap.
+    #
+    # @param [String] node_name
+    #   The sitemap entry element, `url` or `sitemap`.
+    #
+    # @yieldparam [String] link
+    #
+    def each_extracted_sitemap_links(node_name)
+      if plain_text?
+        unzipped_body.each_line do |line|
+          link = line.strip
+          yield(link) unless link.empty?
+        end
+
+        return
+      end
+
+      document = sitemap_doc
+      return unless document
+
+      document.css("#{node_name} loc").each do |element|
+        yield(element.text.strip)
+      end
+    end
+
+    #
+    # The page body, gunzipped when the page is served as a Gzip archive.
+    #
+    # @return [String]
+    #   The body, or an empty String if the Gzip data is invalid.
+    #
+    def unzipped_body
+      return body unless gzip?
+
+      gz = Zlib::GzipReader.new(StringIO.new(body))
+
+      begin
+        gz.read
+      ensure
+        gz.close
+      end
+    rescue Zlib::Error
+      ''
+    end
+  end
+end
diff --git a/spec/agent/sitemap_spec.rb b/spec/agent/sitemap_spec.rb
new file mode 100644
index 00000000..fd010497
--- /dev/null
+++ b/spec/agent/sitemap_spec.rb
@@ -0,0 +1,58 @@
+require 'spec_helper'
+require 'example_app'
+
+require 'spidr/agent'
+
+describe Agent do
+  describe "sitemap" do
+    context "from common sitemap index path" do
+      include_context "example App"
+
+      subject { described_class.new(host: host, sitemap: true) }
+
+      app do
+        before do
+          content_type 'application/xml'
+        end
+
+        get '/sitemap-index.xml' do
+          <<-SITEMAP_XML
+<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <sitemap>
+    <loc>http://example.com/my-sitemap.xml</loc>
+  </sitemap>
+</sitemapindex>
+          SITEMAP_XML
+        end
+
+        get '/my-sitemap.xml' do
+          <<-SITEMAP_XML
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url>
+    <loc>http://example.com/</loc>
+  </url>
+  <url>
+    <loc>http://example.com/some-path</loc>
+  </url>
+</urlset>
+          SITEMAP_XML
+        end
+      end
+
+      before do
+        stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
+      end
+
+      it 'should fetch all URLs in sitemap' do
+        urls = subject.sitemap_urls('http://example.com')
+        expected = [
+          URI('http://example.com/'),
+          URI('http://example.com/some-path')
+        ]
+        expect(urls).to eq(expected)
+      end
+    end
+  end
+end
diff --git a/spec/page/content_types_spec.rb b/spec/page/content_types_spec.rb
index bef17afe..be236d6c 100644
--- a/spec/page/content_types_spec.rb
+++ b/spec/page/content_types_spec.rb
@@ -147,4 +147,8 @@
   describe "#zip?" do
     include_examples "Content-Type method", :zip?, 'application/zip'
   end
+
+  describe "#gzip?" do
+    include_examples "Content-Type method", :gzip?, 'application/gzip'
+  end
 end
diff --git a/spec/page/sitemap_spec.rb b/spec/page/sitemap_spec.rb
new file mode 100644
index 00000000..4ad5ce10
--- /dev/null
+++ b/spec/page/sitemap_spec.rb
@@ -0,0 +1,259 @@
+require 'spec_helper'
+require 'example_page'
+
+require 'zlib'
+require 'spidr/page'
+
+describe Page do
+  include_context 'example Page'
+  let(:content_type) { 'application/xml' }
+
+  let(:body) { %{} }
+  let(:sitemap_urls_xml) do
+    <<-SITEMAP_XML
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url>
+    <loc>http://example.com/</loc>
+  </url>
+  <url>
+    <loc>http://example.com/page</loc>
+  </url>
+</urlset>
+    SITEMAP_XML
+  end
+  let(:sitemap_index_urls_xml) do
+    <<-SITEMAP_XML
+<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <sitemap>
+    <loc>http://example.com/sitemap1.xml.gz</loc>
+  </sitemap>
+  <sitemap>
+    <loc>http://example.com/sitemap2.xml.gz</loc>
+  </sitemap>
+</sitemapindex>
+    SITEMAP_XML
+  end
+
+  describe '#each_sitemap_link' do
+    context 'when the page contains sitemap urls' do
+      let(:body) { sitemap_urls_xml }
+
+      it 'should return an Array of links' do
+        expect { |b| subject.each_sitemap_link(&b) }.to yield_successive_args(
+          "http://#{host}/",
+          "http://#{host}/page"
+        )
+      end
+    end
+
+    context 'when the page contains gzipped sitemap urls' do
+      let(:content_type) { 'application/gzip' }
+      let(:body) do
+        io = StringIO.new.tap(&:binmode)
+        Zlib::GzipWriter.new(io, nil, nil).tap do |gz|
+          gz.write(sitemap_urls_xml)
+          gz.close
+        end
+
+        io.string
+      end
+
+      it 'should return an Array of links' do
+        expect { |b| subject.each_sitemap_link(&b) }.to yield_successive_args(
+          "http://#{host}/",
+          "http://#{host}/page"
+        )
+      end
+    end
+
+    context 'when the page contains no links' do
+      it do
+        expect { |b|
+          subject.each_sitemap_link(&b)
+        }.not_to yield_control
+      end
+    end
+  end
+
+  describe '#sitemap_links' do
+    context 'when the page contains links' do
+      let(:body) { sitemap_urls_xml }
+
+      it 'should return an Array of links' do
+        expect(subject.sitemap_links).to be == [
+          "http://#{host}/",
+          "http://#{host}/page"
+        ]
+      end
+    end
+
+    context 'when the page contains no links' do
+      it { expect(subject.sitemap_links).to be == [] }
+    end
+  end
+
+  describe '#each_sitemap_index_link' do
+    context 'when the page contains sitemap urls' do
+      let(:body) { sitemap_index_urls_xml }
+
+      it 'should return an Array of absolute URIs' do
+        expect { |b| subject.each_sitemap_index_link(&b) }.to yield_successive_args(
+          "http://#{host}/sitemap1.xml.gz",
+          "http://#{host}/sitemap2.xml.gz"
+        )
+      end
+    end
+
+    context 'when the page contains no links' do
+      it do
+        expect { |b|
+          subject.each_sitemap_index_link(&b)
+        }.not_to yield_control
+      end
+    end
+  end
+
+  describe '#sitemap_index_links' do
+    context 'when the page contains links' do
+      let(:body) { sitemap_index_urls_xml }
+
+      it 'should return an Array of absolute URIs' do
+        expect(subject.sitemap_index_links).to be == [
+          "http://#{host}/sitemap1.xml.gz",
+          "http://#{host}/sitemap2.xml.gz"
+        ]
+      end
+    end
+
+    context 'when the page contains no links' do
+      it { expect(subject.sitemap_index_links).to be == [] }
+    end
+  end
+
+  describe '#each_sitemap_url' do
+    context 'when the page contains sitemap urls' do
+      let(:body) { sitemap_urls_xml }
+
+      it 'should return an Array of absolute URIs' do
+        expect { |b| subject.each_sitemap_url(&b) }.to yield_successive_args(
+          URI("http://#{host}/"),
+          URI("http://#{host}/page")
+        )
+      end
+    end
+
+    context 'when the page contains gzipped sitemap urls' do
+      let(:content_type) { 'application/gzip' }
+      let(:body) do
+        io = StringIO.new.tap(&:binmode)
+        Zlib::GzipWriter.new(io, nil, nil).tap do |gz|
+          gz.write(sitemap_urls_xml)
+          gz.close
+        end
+
+        io.string
+      end
+
+      it 'should return an Array of absolute URIs' do
+        expect { |b| subject.each_sitemap_url(&b) }.to yield_successive_args(
+          URI("http://#{host}/"),
+          URI("http://#{host}/page")
+        )
+      end
+    end
+
+    context 'when the page contains no links' do
+      it do
+        expect { |b|
+          subject.each_sitemap_url(&b)
+        }.not_to yield_control
+      end
+    end
+  end
+
+  describe '#sitemap_urls' do
+    context 'when the page contains links' do
+      let(:body) { sitemap_urls_xml }
+
+      it 'should return an Array of absolute URIs' do
+        expect(subject.sitemap_urls).to be == [
+          URI("http://#{host}/"),
+          URI("http://#{host}/page")
+        ]
+      end
+    end
+
+    context 'when the page contains no links' do
+      it { expect(subject.sitemap_urls).to be == [] }
+    end
+  end
+
+  describe '#each_sitemap_index_url' do
+    context 'when the page contains sitemap urls' do
+      let(:body) { sitemap_index_urls_xml }
+
+      it 'should return an Array of absolute URIs' do
+        expect { |b| subject.each_sitemap_index_url(&b) }.to yield_successive_args(
+          URI("http://#{host}/sitemap1.xml.gz"),
+          URI("http://#{host}/sitemap2.xml.gz")
+        )
+      end
+    end
+
+    context 'when the page contains no links' do
+      it do
+        expect { |b|
+          subject.each_sitemap_index_url(&b)
+        }.not_to yield_control
+      end
+    end
+  end
+
+  describe '#sitemap_index_urls' do
+    context 'when the page contains links' do
+      let(:body) { sitemap_index_urls_xml }
+
+      it 'should return an Array of absolute URIs' do
+        expect(subject.sitemap_index_urls).to be == [
+          URI("http://#{host}/sitemap1.xml.gz"),
+          URI("http://#{host}/sitemap2.xml.gz")
+        ]
+      end
+    end
+
+    context 'when the page contains no links' do
+      it { expect(subject.sitemap_index_urls).to be == [] }
+    end
+  end
+
+  describe '#sitemap_index?' do
+    context 'when the page is a sitemap index' do
+      let(:body) { sitemap_index_urls_xml }
+
+      it { expect(subject.sitemap_index?).to be(true) }
+    end
+
+    context 'when the page is a gzipped sitemap index' do
+      let(:content_type) { 'application/gzip' }
+      let(:body) do
+        io = StringIO.new.tap(&:binmode)
+        Zlib::GzipWriter.new(io, nil, nil).tap do |gz|
+          gz.write(sitemap_index_urls_xml)
+          gz.close
+        end
+
+        io.string
+      end
+
+      it { expect(subject.sitemap_index?).to be(true) }
+    end
+
+    context 'when the page is a regular sitemap' do
+      let(:body) { sitemap_urls_xml }
+
+      it { expect(subject.sitemap_index?).to be(false) }
+    end
+  end
+end