From a21486e5dc5a0e80a2fcdb07ebe6cff18450a32b Mon Sep 17 00:00:00 2001 From: Alexander Turenko Date: Mon, 23 Oct 2023 13:31:23 +0300 Subject: [PATCH] tools: add rws_files A simple crawler for https://rws.tarantool.org. --- tools/rws_files | 270 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100755 tools/rws_files diff --git a/tools/rws_files b/tools/rws_files new file mode 100755 index 0000000..ffa93de --- /dev/null +++ b/tools/rws_files @@ -0,0 +1,270 @@ +#!/usr/bin/env tarantool + +local json = require('json') +local fio = require('fio') +local log = require('log').new('rws-files') +local fiber = require('fiber') +local http_client = require('http.client') +local argparse = require('internal.argparse').parse + +-- {{{ General purpose utils + +-- NB: The dedent() function is copied from tarantool sources. + +-- Remove indent from a text. +-- +-- Similar to Python's textwrap.dedent(). +-- +-- It strips all newlines from beginning and all whitespace +-- characters from the end for convenience use with multiline +-- string literals ([[ <...> ]]). +local function dedent(s) + local lines = s:lstrip('\n'):rstrip():split('\n') + + local indent = math.huge + for _, line in ipairs(lines) do + if #line ~= 0 then + indent = math.min(indent, #line:match('^ *')) + end + end + + local res = {} + for _, line in ipairs(lines) do + table.insert(res, line:sub(indent + 1)) + end + return table.concat(res, '\n') +end + +-- }}} General purpose utils + +local params_ok, params = pcall(argparse, arg, { + {'help', 'boolean'}, + {'h', 'boolean'}, + {'quiet', 'boolean'}, + {'q', 'boolean'}, +}) + +-- {{{ Print usage, handle incorrect params. + +local RESET_TERM = '\x1B[0m' +local RED = '\x1B[31m' + +local function print_usage(stream) + local stream = stream or io.stdout + stream:write(dedent(([[ + Usage: %s [--help|-h] [--quiet|-q] [remote dir] + + The remote directory can be for example "/", "/release", + "/release/series-3" and so on. + + See https://rws.tarantool.org for the directory hierarchy. + ]]):format(arg[0])) .. '\n') + stream:flush() +end + +local function usage_error(fmt, ...) + local msg = fmt:format(...) + io.stderr:write(('%sError: %s%s\n\n'):format(RED, msg, RESET_TERM)) + print_usage(io.stderr) + os.exit(1) +end + +if not params_ok then + usage_error(tostring(params)) +end + +if #params > 1 then + usage_error('Only one positional parameter is expected, got %d.', #params) +end + +if params[1] ~= nil and not params[1]:startswith('/') then + usage_error('The remote directory must start from "/", got %q.', params[1]) +end + +if params.help or params.h then + print_usage() + os.exit() +end + +-- }}} Print usage, handle incorrect params. + +local base_url = 'https://rws.tarantool.org' + +local log_level = (params.quiet or params.q) and 'info' or 'debug' +log.cfg({ + modules = {['rws-files'] = log_level}, +}) + +local function visit(ctx) + local dir = table.remove(ctx.next_dirs, 1) + assert(dir:startswith('/'), dir) + ctx.in_progress[dir] = true + ctx.visited_dirs[dir] = true + + -- Fetch the page. + local url = base_url .. dir + log.debug('http get: %s', url) + local ok, res = pcall(http_client.get, url) + if not ok then + log.warn('http get error, retrying: %s', res) + + -- Return to the queue to try again. + table.insert(ctx.next_dirs, dir) + ctx.in_progress[dir] = nil + ctx.visited_dirs[dir] = nil + return + end + + -- We shouldn't download files. + -- + -- Sadly, we can't check it using HTTP OPTIONS request type, + -- because it return text/html unconditionally. + local content_type = res.headers['content-type'] + if not content_type:startswith('text/html') then + local err = 'URL %s gives a page with "content-type" = %q; should ' .. + 'be excluded' + error(err:format(url, content_type)) + end + + if res.status ~= 200 then + log.warn('http get status %d, retrying', res.status) + + -- Return to the queue to try again. + table.insert(ctx.next_dirs, dir) + ctx.in_progress[dir] = nil + ctx.visited_dirs[dir] = nil + return + end + + local body = res.body + + -- Parse and tags. + local t = {} + body:gsub('(.-)', function(tr) + table.insert(t, {}) + tr:gsub('(.-)', function(td) + table.insert(t[#t], td:strip()) + end) + end) + + -- Parse rows. + for i, row in ipairs(t) do + -- Skip header row. + if #row == 0 then + assert(i == 1) + goto continue + end + assert(#row == 4) + + local is_dir + + -- [PARENTDIR] + -- [DIR] + -- [DIR] + local img = t[i][1] + if img:match('/directory.png') ~= nil then + is_dir = true + elseif img:match('/file.png') ~= nil then + is_dir = false + else + assert(img:match('/back.png') ~= nil) + goto continue + end + + -- 1. Parent Directory + -- 2. dir + -- 3. file + -- + -- 1st is handled above. + local a = t[i][2] + a:gsub('