-
-
Notifications
You must be signed in to change notification settings - Fork 109
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Sitemap XML support #69
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
require 'set' | ||
|
||
module Spidr | ||
class Agent | ||
# Common locations for Sitemap(s) | ||
COMMON_SITEMAP_LOCATIONS = %w[ | ||
sitemap.xml | ||
sitemap.xml.gz | ||
sitemap.gz | ||
sitemap_index.xml | ||
sitemap-index.xml | ||
sitemap_index.xml.gz | ||
sitemap-index.xml.gz | ||
].freeze | ||
|
||
# | ||
# Initializes the sitemap fetcher. | ||
# | ||
def initialize_sitemap | ||
@sitemap = true | ||
end | ||
|
||
# | ||
# Returns the URLs found as per the sitemap.xml spec. | ||
# | ||
# @return [Array<URI::HTTP>, Array<URI::HTTPS>] | ||
# The URLs found. | ||
# | ||
# @see https://www.sitemaps.org/protocol.html | ||
def sitemap_urls(url) | ||
return [] unless @sitemap | ||
base_url = to_base_url(url) | ||
|
||
if @robots | ||
if urls = @robots.other_values(base_url)['Sitemap'] | ||
return urls.flat_map { |u| get_sitemap_urls(url: u) } | ||
end | ||
end | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a clever way of populating the the queue using the Something like: agent.every_robots_txt do |robots| # RobotsTXT class
# check robots for a `Sitemap` entry
end
agent.every_sitemap_xml do |sitemap| # SitemapXML class
sitemap.urls.each { |url| agent.enqueue(url) }
end There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, I could add a |
||
|
||
COMMON_SITEMAP_LOCATIONS.each do |path| | ||
if (page = get_page("#{base_url}/#{path}")).code == 200 | ||
return get_sitemap_urls(page: page) | ||
end | ||
end | ||
|
||
[] | ||
end | ||
|
||
private | ||
|
||
def get_sitemap_urls(url: nil, page: nil) | ||
page = get_page(url) if page.nil? | ||
return [] unless page | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could be rewritten as a nested if url && !page
unless (page = get_page(url))
return []
end
end |
||
|
||
if page.sitemap_index? | ||
page.each_sitemap_index_url.flat_map { |u| get_sitemap_urls(url: u) } | ||
else | ||
page.sitemap_urls | ||
end | ||
end | ||
|
||
def to_base_url(url) | ||
uri = url | ||
uri = URI.parse(url) unless url.is_a?(URI) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it's safe to use |
||
|
||
"#{uri.scheme}://#{uri.host}" | ||
end | ||
end | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,194 @@ | ||
require 'nokogiri' | ||
require 'zlib' | ||
|
||
module Spidr | ||
class Page | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel like there should be a |
||
include Enumerable | ||
|
||
# | ||
# Enumerates over the links in the sitemap page. | ||
# | ||
# @yield [link] | ||
# If a block is given, it will be passed every link in the | ||
# sitemap page. | ||
# | ||
# @yieldparam [String] link | ||
# A URL from the sitemap page. | ||
# | ||
# @return [Enumerator] | ||
# If no block is given, an enumerator object will be returned. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add extra |
||
def each_sitemap_link | ||
return enum_for(__method__) unless block_given? | ||
|
||
each_extracted_sitemap_links('url') { |url| yield(url) } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could also be simplified by passing in a |
||
end | ||
|
||
# | ||
# Return all links defined in Sitemap. | ||
# | ||
# @return [Array<String>] | ||
# of links defined in Sitemap. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sentence fragment. |
||
def sitemap_links | ||
each_sitemap_link.to_a | ||
end | ||
|
||
# | ||
# Enumerates over the Sitemap index links in the sitemap page. | ||
# | ||
# @yield [link] | ||
# If a block is given, it will be passed every link in the | ||
# sitemap page. | ||
# | ||
# @yieldparam [String] link | ||
# A URL from the sitemap page. | ||
# | ||
# @return [Enumerator] | ||
# If no block is given, an enumerator object will be returned. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add extra |
||
def each_sitemap_index_link | ||
return enum_for(__method__) unless block_given? | ||
|
||
each_extracted_sitemap_links('sitemap') { |url| yield(url) } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably can be simplified by passing in a |
||
end | ||
|
||
# | ||
# Return all Sitemap index links defined in sitemap. | ||
# | ||
# @return [Array<String>] | ||
# of links defined in Sitemap. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add extra |
||
def sitemap_index_links | ||
each_sitemap_index_link.to_a | ||
end | ||
|
||
# | ||
# Enumerates over the URLs in the sitemap page. | ||
# | ||
# @yield [url] | ||
# If a block is given, it will be passed every URL in the | ||
# sitemap page. | ||
# | ||
# @yieldparam [URI::HTTP, URI::HTTPS] url | ||
# A URL from the sitemap page. | ||
# | ||
# @return [Enumerator] | ||
# If no block is given, an enumerator object will be returned. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add extra |
||
def each_sitemap_url | ||
return enum_for(__method__) unless block_given? | ||
|
||
each_sitemap_link do |link| | ||
if (url = to_absolute(link)) | ||
yield url | ||
end | ||
end | ||
end | ||
|
||
# | ||
# Return all URLs defined in Sitemap. | ||
# | ||
# @return [Array<URI::HTTP>, Array<URI::HTTPS>] | ||
# of URLs defined in Sitemap. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sentence fragment. |
||
def sitemap_urls | ||
each_sitemap_url.to_a | ||
end | ||
|
||
# | ||
# Enumerates over the sitemap URLs in the sitemap page. | ||
# | ||
# @yield [url] | ||
# If a block is given, it will be passed every sitemap URL in the | ||
# sitemap page. | ||
# | ||
# @yieldparam [URI::HTTP, URI::HTTPS] url | ||
# A sitemap URL from the sitemap page. | ||
# | ||
# @return [Enumerator] | ||
# If no block is given, an enumerator object will be returned. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add extra |
||
def each_sitemap_index_url | ||
return enum_for(__method__) unless block_given? | ||
|
||
each_sitemap_index_link do |link| | ||
if (url = to_absolute(link)) | ||
yield url | ||
end | ||
end | ||
end | ||
|
||
# | ||
# Return all sitemap index URLs defined in Sitemap. | ||
# | ||
# @return [Array<URI::HTTP>, Array<URI::HTTPS>] | ||
# Sitemap index URLs defined in Sitemap. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add extra |
||
def sitemap_index_urls | ||
each_sitemap_index_url.to_a | ||
end | ||
|
||
# | ||
# Returns true if Sitemap is a Sitemap index. | ||
# | ||
# @return [Boolean] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add extra |
||
def sitemap_index? | ||
sitemap_root_name == 'sitemapindex' | ||
end | ||
|
||
# | ||
# Returns true if Sitemap is a regular list of URLs. | ||
# | ||
# @return [Boolean] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add extra |
||
def sitemap_urlset? | ||
sitemap_root_name == 'urlset' | ||
end | ||
|
||
# | ||
# Returns the document for the sitemap, if the content type is gzip it | ||
# will be uncompressed. | ||
# | ||
# @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil] | ||
# The document that represents sitemap XML pages. | ||
# Returns `nil` if the page is neither XML, gzipped XML or if | ||
# the page could not be parsed properly. | ||
# | ||
# @see #doc | ||
# | ||
def sitemap_doc | ||
return doc if doc && !gzip? | ||
|
||
begin | ||
@sitemap_doc ||= Nokogiri::XML::Document.parse(unzipped_body, @url.to_s, content_charset) | ||
rescue | ||
end | ||
end | ||
|
||
private | ||
|
||
def sitemap_root_name | ||
return unless doc.root | ||
|
||
doc.root.name | ||
end | ||
|
||
def each_extracted_sitemap_links(node_name) | ||
if plain_text? | ||
return unzipped_body.each_line { |url| yield(url.strip) } | ||
end | ||
|
||
return unless sitemap_doc | ||
|
||
sitemap_doc.css("#{node_name} loc").each do |element| | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we use XPath here since the file is XML? I think the XPath would be |
||
yield(element.text) | ||
end | ||
end | ||
|
||
def unzipped_body | ||
return body unless gzip? | ||
|
||
io = StringIO.new(body) | ||
gz = Zlib::GzipReader.new(io) | ||
body = gz.read | ||
rescue Zlib::Error | ||
'' | ||
ensure | ||
gz.close if gz | ||
|
||
body | ||
end | ||
end | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
require 'spec_helper' | ||
require 'example_app' | ||
|
||
require 'spidr/agent' | ||
|
||
describe Agent do | ||
describe "sitemap" do | ||
context "from common sitemap index path" do | ||
include_context "example App" | ||
|
||
subject { described_class.new(host: host, sitemap: true) } | ||
|
||
app do | ||
before do | ||
content_type 'application/xml' | ||
end | ||
|
||
get '/sitemap-index.xml' do | ||
<<-SITEMAP_XML | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> | ||
<sitemap> | ||
<loc>http://example.com/my-sitemap.xml</loc> | ||
</sitemap> | ||
</sitemapindex> | ||
SITEMAP_XML | ||
end | ||
|
||
get '/my-sitemap.xml' do | ||
<<-SITEMAP_XML | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> | ||
<url> | ||
<loc>http://example.com/</loc> | ||
</url> | ||
<url> | ||
<loc>http://example.com/some-path</loc> | ||
</url> | ||
</urlset> | ||
SITEMAP_XML | ||
end | ||
end | ||
|
||
before do | ||
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is redundant code. Already defined in spec/example_app.rb:22. |
||
end | ||
|
||
it 'should fetch all URLs in sitemap' do | ||
urls = subject.sitemap_urls('http://example.com') | ||
expected = [ | ||
URI('http://example.com/'), | ||
URI('http://example.com/some-path') | ||
] | ||
expect(urls).to eq(expected) | ||
end | ||
end | ||
end | ||
end |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add extra
#
.