Simple Command Line Interface (CLI) #68

Open · wants to merge 6 commits into master
bin/spidr (new file): 152 additions, 0 deletions
@@ -0,0 +1,152 @@
#!/usr/bin/env ruby

# for dev purposes
require 'bundler/setup' if ENV['SPIDR_GEM_DEV']
require 'spidr'

require 'csv'
require 'optparse'
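
# Example invocations (illustrative only; the URL and option values below are
# placeholders, not part of this change):
#
#   spidr --header --columns=url,code --limit=10 https://example.com
#   spidr --content-types=html,css --delay=1 https://example.com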

start_at = nil
header = false
columns = %w[url]
content_types = nil

# Options collected here are passed through to Spidr.site (and on to Spidr::Agent)
spidr_options = {}

opt_parser = nil

OptionParser.new do |parser|
opt_parser = parser

parser.banner = 'Usage: spidr [options] <url>'
parser.default_argv = ARGV

parser.on('--columns=[val1,val2]', Array, 'Page attributes to print as columns (default: url)') do |value|
columns = value || columns
end

parser.on('--content-types=[val1,val2]', Array, 'Only output pages with these content types (html, javascript, css, json, ...)') do |value|
content_types = value
end

parser.on('--[no-]header', 'Include a CSV header row') do |value|
header = value
end

# Spidr::Agent options
parser.on('--open-timeout=val', Integer, 'Optional open timeout') do |value|
spidr_options[:open_timeout] = value
end

parser.on('--read-timeout=val', Integer, 'Optional read timeout') do |value|
spidr_options[:read_timeout] = value
end

parser.on('--ssl-timeout=val', Integer, 'Optional ssl timeout') do |value|
spidr_options[:ssl_timeout] = value
end

parser.on('--continue-timeout=val', Integer, 'Optional continue timeout') do |value|
spidr_options[:continue_timeout] = value
end

parser.on('--keep-alive-timeout=val', Integer, 'Optional keep_alive timeout') do |value|
spidr_options[:keep_alive_timeout] = value
end

# Proxy settings are collected into a nested hash under :proxy
parser.on('--proxy-host=val', String, 'The host the proxy is running on') do |value|
(spidr_options[:proxy] ||= {})[:host] = value
end

parser.on('--proxy-port=val', Integer, 'The port the proxy is running on') do |value|
(spidr_options[:proxy] ||= {})[:port] = value
end

parser.on('--proxy-user=val', String, 'The user to authenticate as with the proxy') do |value|
(spidr_options[:proxy] ||= {})[:user] = value
end

parser.on('--proxy-password=val', String, 'The password to authenticate with') do |value|
(spidr_options[:proxy] ||= {})[:password] = value
end

parser.on('--default-headers=[key1=val1,key2=val2]', Array, 'Default headers to set for every request') do |value|
spidr_options[:default_headers] = (value || []).map { |v| v.split('=', 2) }.to_h
end

parser.on('--host-header=val', String, 'The HTTP Host header to use with each request') do |value|
spidr_options[:host_header] = value
end

parser.on('--host-headers=[key1=val1,key2=val2]', Array, 'The HTTP Host headers to use for specific hosts') do |value|
spidr_options[:host_headers] = (value || []).map { |v| v.split('=', 2) }.to_h
end

parser.on('--user-agent=val', String, 'The User-Agent string to send with each request') do |value|
spidr_options[:user_agent] = value
end

parser.on('--referer=val', String, 'The Referer URL to send with each request') do |value|
spidr_options[:referer] = value
end

parser.on('--delay=val', Integer, 'The number of seconds to pause between each request') do |value|
spidr_options[:delay] = value
end

parser.on('--queue=[val1,val2]', Array, 'The initial queue of URLs to visit') do |value|
spidr_options[:queue] = value
end

parser.on('--history=[val1,val2]', Array, 'The initial list of visited URLs') do |value|
spidr_options[:history] = value
end

parser.on('--limit=val', Integer, 'The maximum number of pages to visit') do |value|
spidr_options[:limit] = value
end

parser.on('--max-depth=val', Integer, 'The maximum link depth to follow') do |value|
spidr_options[:max_depth] = value
end

parser.on('--[no-]robots', 'Respect robots.txt') do |value|
spidr_options[:robots] = value
end

# Boilerplate CLI
parser.on_tail('-h', '--help', 'Show this message') do
puts parser
exit
end

parser.on_tail('--version', 'Show version') do
puts "Spidr version #{Spidr::VERSION}"
exit
end
end.parse!

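# OptionParser#parse! removes the recognized options from ARGV,
# so the remaining positional argument is the start URL.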
start_at = ARGV.last
if start_at.nil? || start_at.empty?
puts opt_parser
raise(ArgumentError, "<url> can't be blank")
end

# Main: crawl the site and print one CSV row per visited page
puts CSV.generate_line(columns) if header
Spidr.site(start_at, spidr_options) do |spider|
spider.every_page do |page|
# Skip pages that match none of the requested content types
next if content_types && content_types.none? { |type| page.is_content_type?(type) }

row = columns.map { |column| page.public_send(column) }
puts CSV.generate_line(row)
end
end