diff --git a/.gitignore b/.gitignore index d6ab7dc..35e70b0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ -__pycache__ +*__pycache__ *.pyc -*.ini *.egg-info +*.json dist -blc_venv \ No newline at end of file +venv diff --git a/Makefile b/Makefile index b287189..bbb2948 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,6 @@ .DEFAULT_GOAL=help -CONFIG_FILE=./conf.ini -VENVPATH=blc_venv +VENVPATH=venv PYTHON=$(VENVPATH)/bin/python3 PIP=$(VENVPATH)/bin/pip @@ -11,30 +10,21 @@ $(VENVPATH)/bin/activate: requirements.txt . $(VENVPATH)/bin/activate; \ $(PIP) install -r requirements.txt -$(CONFIG_FILE): - echo "[-] adding config file..." - cp example.conf.ini $(CONFIG_FILE) - ##install-deps: setup your dev environment -install-deps: venv $(CONFIG_FILE) +install-deps: venv ##run: run the api locally - ex: make run link="https://osscameroon.com" run: install-deps - $(PYTHON) -m broken_link_checker $(link) --delay 1 + $(PYTHON) -m blc $(link) lint: install-deps - $(PYTHON) -m flake8 broken_link_checker --show-source --statistics + $(PYTHON) -m flake8 blc --show-source --statistics build: install-deps $(PYTHON) -m build ##test: test your code test: build - $(PYTHON) -m unittest - ls dist/blc-*.whl | sort -r | grep . -m 1 > /tmp/last_package - $(PIP) install -r /tmp/last_package - PYTHON=$(PYTHON) NB_BROKEN_LINK_EXPECTED=23 sh tests/checker_test.sh - PYTHON=$(PYTHON) NB_BROKEN_LINK_EXPECTED=31 BLC_FLAGS="-n" sh tests/checker_test.sh clean: rm -rf $(VENVPATH) dist diff --git a/README-PYPI.md b/README-PYPI.md index 8488d6c..865c76f 100644 --- a/README-PYPI.md +++ b/README-PYPI.md @@ -12,54 +12,7 @@ This is a console project that will run as a cron job. # Running the Project Locally -Create a virtual env: - -*NB: Compatibility Python3.10+* - -```bash -python3 -m venv blc_venv -``` - -Active the virtual env: -## Linux -```bash -source blc_venv/bin/activate -``` -## Windows -```cmd -blc_venv\Scripts\activate.bat -``` - -Install dependency: - -```bash -pip install --upgrade pip -pip install -i https://test.pypi.org/simple/ blc -``` - -Finally, run: - -```bash -python -m broken_link_checker https://example.com --delay 1 -``` - -To receive a report by email, you can use this command - -```bash -python -m broken_link_checker https://example.com --delay 1 --sender \ - --password --smptp_server --recipient -``` - -If also possible to specify a config file -NB: Refer to our default config file *broken_link_checker/conf.ini* to knw how to write it. -```bash -cp example.conf.ini conf.ini -``` - -Apply your modifications and run the program -```bash -python -m broken_link_checker -c conf.ini -``` +-- documentation need to be rite here for the Makefile -- *NB:* Some email service provider ask to enable some settings to allow less secure apps. diff --git a/README.md b/README.md index d9ffe0b..865c76f 100644 --- a/README.md +++ b/README.md @@ -12,45 +12,9 @@ This is a console project that will run as a cron job. 
# Running the Project Locally -## For normal usage +-- documentation need to be rite here for the Makefile -- -[README](README-PYPI.md) - -## For developer only - -First, clone the repository to your local machine: -```bash -git clone https://github.com/osscameroon/broken_link_checker.git -``` - -Create a virtual env: - -*NB: Compatibility Python3.10+* -```bash -python3 -m venv blc_venv -``` - -Active the virtual env: -```bash -source blc_venv/bin/activate -``` - -Install dependencies: -```bash -make install-deps -``` - -Build the package -```bash -make build -``` - -For the next step confer *normal usage* - -If you want run the tests, use this command -```bash -make test -``` +*NB:* Some email service provider ask to enable some settings to allow less secure apps. # License MIT diff --git a/broken_link_checker/__init__.py b/blc/__init__.py similarity index 100% rename from broken_link_checker/__init__.py rename to blc/__init__.py diff --git a/blc/__main__.py b/blc/__main__.py new file mode 100644 index 0000000..c9033c7 --- /dev/null +++ b/blc/__main__.py @@ -0,0 +1,202 @@ +""" +main module of blc +""" +import json +from urllib.parse import urljoin, urlparse +import bs4 +from requests.models import Response +from requests_html import HTMLSession + +import colorama +from bs4 import BeautifulSoup + +# init the colorama module +colorama.init() + +GREEN = colorama.Fore.GREEN +GRAY = colorama.Fore.LIGHTBLACK_EX +RESET = colorama.Fore.RESET +YELLOW = colorama.Fore.YELLOW +CYAN = colorama.Fore.CYAN +RED = colorama.Fore.RED +WHITE = colorama.Fore.WHITE + +# initialize the set of links (unique links) +internal_urls = set() +external_urls = set() +good_urls = set() +bad_urls = set() + +total_urls_visited = 0 + + +def is_valid(url: str): + """ + Checks whether `url` is a valid URL. + """ + parsed = urlparse(url) + return bool(parsed.netloc) and bool(parsed.scheme) + + +def request_get(url: str) -> Response: + """ + The simple requests get function + """ + # initialize an HTTP session + session = HTMLSession() + # make HTTP request & retrieve response + response = session.get(url) + # execute Javascript + try: + getattr(response, 'html').render() + except Exception: + pass + + return response + + +def get_links_from_hrefs( + soup: bs4.BeautifulSoup, + domain_name: str, + urls: set +) -> set: + """ + Extract from soup object, urls from href + """ + for a_tag in soup.findAll('a'): + href = a_tag.attrs.get('href') + if href == '' or href is None: + # href empty tag + continue + # join the URL if it's relative (not absolute link) + href = urljoin(url, href) + parsed_href = urlparse(href) + # remove URL GET parameters, URL fragments, etc. + href = ( + f'{parsed_href.scheme}://{parsed_href.netloc}{parsed_href.path}' + ) + if 'mailto://' in href: + # For a mail link + print(f'{CYAN}[*] Mail link not checked : {href}{RESET}') + continue + if not is_valid(href): + # not a valid URL + continue + if href in internal_urls: + # already in the set + continue + if domain_name not in href: + # external link + if href not in external_urls: + print(f' {GRAY}[!] 
External link: {href}{RESET}') + external_urls.add(href) + continue + print(f' {WHITE}[*] Internal link: {href}{RESET}') + urls.add(href) + internal_urls.add(href) + + urls = urls.union(get_all_website_links(href)) + + return urls + + +def get_all_website_links(url: str) -> set: + """ + Returns all URLs that is found on `url` in which + it belongs to the same website + """ + # all URLs of `url` + urls = set() + # domain name of the URL without the protocol + domain_name = urlparse(url).netloc + try: + resp = request_get(url) + if '40' in str(resp.status_code): + bad_urls.add(json.dumps({ + 'reason': resp.status_code, + 'url': url + })) + print( + f'{RED}[✗] unreacheable: ({resp.status_code}) <> {url}{RESET}' + ) + return urls + except ConnectionError: + bad_urls.add(json.dumps({ + 'reason': 404, + 'url': url + })) + print( + f'{RED}[✗] unreacheable: (404) <> {url}{RESET}' + ) + return urls + + print(f'{GREEN}[✓] <> {url}{RESET}') + good_urls.add(url) + soup = BeautifulSoup(getattr(resp, 'html').html, 'html.parser') + urls = get_links_from_hrefs(soup, domain_name, urls) + + return urls + + +def crawl(url: str, max_urls: int = 30): + """ + Crawls a web page and extracts all links. + You'll find all links in `external_urls` and `internal_urls` + global set variables. + params: + max_urls (int): number of max urls to crawl, default is 30. + """ + global total_urls_visited + total_urls_visited += 1 + links = get_all_website_links(url) + for link in links: + if total_urls_visited > max_urls: + break + crawl(link, max_urls=max_urls) + + +def generate_report(): + """ + This function will crawl the website and print links report + on the stdout and also inside the final json file + """ + crawl(url, max_urls=max_urls) + print('[+] Internal links:', len(internal_urls)) + print('[+] External links:', len(external_urls)) + print('[+] Bad-links/Internal-links:', len(bad_urls)) + print('[+] Good-links/Internal-links:', len(good_urls)) + print('[+] Total URLs:', len(external_urls) + len(internal_urls)) + + domain_name = urlparse(url).netloc + + report = { + 'external-urls': list(external_urls), + 'internal-urls': list(internal_urls), + 'good-urls': list(good_urls), + 'bad-urls': list(bad_urls) + } + + # save the internal links to a file + with open(f'{domain_name}_report.json', 'w', encoding='utf-8') as f: + f.write(json.dumps(report, indent=3)) + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser( + description='Link Extractor Tool with Python' + ) + parser.add_argument('url', help='The URL to extract links from.') + parser.add_argument( + '-m', + '--max-urls', + help='Number of max URLs to crawl, default is 30.', + default=30, + type=int, + ) + + args = parser.parse_args() + url = args.url + max_urls = args.max_urls + generate_report() diff --git a/blc/notifier.py b/blc/notifier.py new file mode 100644 index 0000000..0f1efc0 --- /dev/null +++ b/blc/notifier.py @@ -0,0 +1,5 @@ +"""Notifier module.""" + + +class Notifier: + """ Notifier """ diff --git a/broken_link_checker/__main__.py b/broken_link_checker/__main__.py deleted file mode 100644 index d610d26..0000000 --- a/broken_link_checker/__main__.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""Main module of the Broken Link Checker.""" - - -from argparse import ArgumentParser -from .checker import Checker -from .notifier import Notifier -from configparser import ConfigParser -import sys -import logging -import threading - -logging.basicConfig( - level=logging.DEBUG, - 
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) - - -def main(args): - """Do something.""" - # parse values from a configuration file if provided and use those as the - # default values for the argparse arguments - config_argparse = ArgumentParser(add_help=False) - config_argparse.add_argument('-c', '--config-file', - help='path to configuration file') - config_argparse.add_argument('-D', '--debug', - help='enable the debug mode', - action='store_true') - config_args, _ = config_argparse.parse_known_args(args) - - defaults = { - "host": None, - "delay": None, - "sender": None, - "password": None, - "smtp_server": None, - "recipient": None, - } - - if not config_args.debug: - logging.disable(logging.CRITICAL) - - if config_args.config_file: - logging.info('Loading of the config file...') - try: - config_parser = ConfigParser() - with open(config_args.config_file) as f: - config_parser.read_file(f) - config_parser.read(config_args.config_file) - except OSError as err: - print(err) - sys.exit(1) - - defaults.update(dict(config_parser.items('Checker'))) - defaults.update(dict(config_parser.items('Notifier'))) - - # parse the program's main arguments using the dictionary of defaults and - # the previous parsers as "parent' parsers - parser = ArgumentParser( - parents=[config_argparse]) - parser.set_defaults(**defaults) - - if not defaults['host']: - parser.add_argument('host', type=str, - help='Eg: http://example.com') - parser.add_argument('-d', '--delay', type=float, - help='It represent the delay between each request') - parser.add_argument('-s', '--sender', type=str, - help='It represent the email used to send the report') - parser.add_argument('-p', '--password', type=str, - help='It represent the password used for the login') - parser.add_argument('-S', '--smtp_server', type=str, - help='It represent the email server used to send the report') - parser.add_argument('-r', '--recipient', type=str, - help='It represent the email where send the report') - parser.add_argument('-n', '--deep-scan', action='store_true', - help='Enable the deep scan') - args = parser.parse_args() - - # We verify the dependency - if not args.host: - parser.error('host is required') - elif (args.sender or args.password or args.smtp_server or args.recipient)\ - and not (args.sender and args.password and args.smtp_server and args.recipient): - parser.error('bad configuration of the notifier') - else: - pass - - checker_threads = [] - broken_url = {} - - for target in args.host.split(','): - # We initialize the checker - checker = Checker( - target, - delay=args.delay if args.delay is not None else 1.0, - deep_scan=args.deep_scan, - ) - # We config the shared dict - broken_url[target] = {} - checker.broken_url = broken_url[target] - - t = threading.Thread(target=checker.run) - checker_threads.append(t) - t.daemon = True - - # We initialize the notifier - notifier = Notifier( - smtp_server=args.smtp_server, - username=args.sender, - password=args.password, - ) - - # We start the checkers - for thread in checker_threads: - logging.info('Checking of %s' % args.host) - thread.start() - - # We wait for the completion - [thread.join() for thread in checker_threads] - - # We build the report - msg = 'Hello, the report of the broken link checker is ready.\n' - for target in broken_url: - msg += f"Report of {target}:\n" - if broken_url[target]: - for data in broken_url[target].items(): - msg += str(data) + '\n' - else: - msg += "No broken url found\n" - - # We verify if the email notifier is configured - if 
args.smtp_server: - # We notify the admin - logging.info('Sending of the report to %s...' % args.recipient) - notifier.send(subject='Broken links found', body=msg, recipient=args.recipient) - else: - print(msg) - - -if __name__ == '__main__': - main(sys.argv[1:]) diff --git a/broken_link_checker/checker.py b/broken_link_checker/checker.py deleted file mode 100644 index 0e22e46..0000000 --- a/broken_link_checker/checker.py +++ /dev/null @@ -1,233 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""Checker module.""" - -import requests -from urllib.parse import urljoin -import time -import logging -import re -import difflib - -# We change the log level for requests’s logger -logging.getLogger("requests").setLevel(logging.WARNING) - - -class Checker: - """ - Check if an broken URL is present inside a website. - - :host represent the website to check - :delay represent the delay between each request - :deep_scan enable the check of foreign url - just verify the availability of these URL - """ - - def __init__(self, host: str, delay: int = 1, deep_scan: bool = False): - """Init the checker.""" - # We config the logger - self.logging = logging.getLogger(f'checker({host})') - self.logging.setLevel(logging.DEBUG) - self.logging.debug('We initialize the checker for %s' % host) - - # We config the connection - self.conn = requests.session() - self.conn.headers.update({ - "User-Agent": "BrokenLinkChecker/1.0", - }) - - self.host = host - - # Delay between each request - self.delay = delay - - # Shallow scan of foreign url - self.deep_scan = deep_scan - - # Will represent the list of URL to check - self.url_to_check = [host] - - # Will represent the list of checked URL - self.checked_url = [] - - # Will represent the list of broken URL - self.broken_url = {} - - # Will represent the previous webpage content - self.prev_data = '' - - # Represent a regex to find all link URLs inside an text source - self.REGEX_TEXT_URL = re.compile( - r"href=[\'\"](.*?)[\'\"]" - r"|href=(.*?)[ |>]" - r"|(.*?)" - r"|(.*?)" - r"|src=[\'\"](.*?)[\'\"]" - r"|src=(.*?)[ |>]" - # Ref: http://www.regexguru.com/2008/11/detecting-urls-in-a-block-of-text/ - r"|\b(https?://[-A-Z0-9+&@#/%?=~_|!:,.;]*[A-Z0-9+&@#/%=~_|])", - re.IGNORECASE - ) - - # Regex to verify the content type - self.REGEX_CONTENT_TYPE = re.compile( - r"text/(xml|html)" - r"|application/(rss|xml)", - re.IGNORECASE - ) - - def is_same_host(self, url): - """ - Verify if the url belongs the host. - - :url the url to verify - """ - host = requests.utils.urlparse(self.host) - url = requests.utils.urlparse(url) - - if not url.scheme: - return True - elif url.scheme == host.scheme\ - and url.netloc == host.netloc\ - and url.port == host.port: - return True - else: - return False - - def check(self, url: str) -> requests.Response: - """ - Verify if a link is broken of not. - - :url represent the URL to check - """ - # We verify the URL is already checked - if url in self.checked_url: - return None - - self.logging.info('Checking of %s...' % url) - - # We mark the URL checked - self.checked_url.append(url) - - # We make a connection - try: - if self.is_same_host(url): - response = self.conn.get(url, timeout=2, stream=True) - else: - response = self.conn.head(url, timeout=2) - except requests.exceptions.ReadTimeout: - self.broken_url[url] = "Timeout!" - except requests.exceptions.ConnectionError: - self.broken_url[url] = "Connection aborted!" - except requests.exceptions.TooManyRedirects: - self.broken_url[url] = "Too many redirection!" 
- else: - # We verify the response status - # 2xx stand for request was successfully completed - if response.ok: - return response if self.is_same_host(url) else None - else: - self.broken_url[url] = response.reason - - self.logging.warning( - '%s maybe broken because status code: %i' % - (url, response.status_code) - ) - return None - - def update_list(self, response: requests.Response) -> None: - """ - Update the list of URL to checked in function of the URL get in a webpage. - - :response represent the http response who contains the data to analyze - """ - # We verify if the content is a webpage - if self.REGEX_CONTENT_TYPE.match(response.headers['Content-Type']): - self.logging.debug('Getting of the webpage...') - # we read max 2**20 bytes by precaution - response.raw.decode_content = True - data = response.raw.read(1048576) - self.logging.debug('Decoding of data...') - data = data.decode() - - # We verify if we are not already got this content in the previous request - if difflib.SequenceMatcher(None, data, self.prev_data).ratio() > 0.9: - self.logging.warning( - response.url + - ' skipped because content similar at +90% with the previous URL.' - ) - return - else: - self.prev_data = data - - self.logging.debug('Getting of the URLs...') - - matches = self.REGEX_TEXT_URL.findall(data) - - # In this step, we have two possibilities - # 1. The URL belongs to the HOST - # 1.1. The URL is absolute - # 1.2. The URL is relative - # 2. The URL don't belongs to the HOST - for match in matches: - # We get the URL match - url = [i for i in match if i] - if url: - url = url[0] - else: - continue - - # 1.1 and 1.2 - if self.is_same_host(url): - # 1.2 - if not requests.utils.parse_url(url).scheme: - # We verify if the URL is different of the parent - if not url.startswith('#') and not url.startswith('?'): - # We build the absolute URL - url = urljoin(response.url, url) - else: - # Since this URL is relative - # maybe it is not different of the parent - # Eg: /home and /home# - continue - else: - # 1.1 - pass - # 2 - elif self.deep_scan: - data = requests.utils.urlparse(url) - # Just the HTTP and HTTPS scheme will be allowed - if data.scheme in ['http', 'https']: - pass - else: - continue - else: - continue - - # Except if the deep_scan is enable - # At this point, the URL belongs to the HOST - # We verify that the URL is neither already added nor checked - if url not in self.url_to_check \ - and url not in self.checked_url \ - and url != response.url: - self.logging.debug('Add the URL %s' % url) - self.url_to_check.append(url) - else: - continue - - # We close the connection - response.close() - else: - self.logging.warning( - '%s ignored because Content-Type %s' % - (response.url, response.headers['Content-Type']) - ) - - def run(self) -> None: - """Run the checker.""" - # We check while we have an URL unchecked - while (self.url_to_check): - response = self.check(self.url_to_check.pop(0)) - if response: - self.update_list(response) - time.sleep(self.delay) diff --git a/broken_link_checker/notifier.py b/broken_link_checker/notifier.py deleted file mode 100644 index 433eda3..0000000 --- a/broken_link_checker/notifier.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Notifier module.""" - -# Import smtplib for the actual sending function -import smtplib -# Here are the email package modules we'll need -from email.message import EmailMessage -import logging - - -class Notifier: - """ - Notify by email. 
- - :smtp_server represent the address of the email service provider - :username represent the email of the sender - :password represent the password of the sender - """ - - def __init__(self, smtp_server: str, username: str, password: str): - """Init the notifier.""" - # We config the module logger - self.logging = logging.getLogger('notifier') - self.logging.setLevel(logging.DEBUG) - self.logging.debug('We initialize the notifier') - - self.smtp_server = smtp_server - self.sender = username - self.password = password - - def send(self, recipient: str, subject: str, body: str) -> None: - """ - Send an email. - - :recipient represent the email of the dest - :subject represent the subject of the notification - :body represent the content of the notification - """ - self.logging.debug('We build the message') - # Create the container email message. - msg = EmailMessage() - msg['Subject'] = subject - msg['From'] = self.sender - msg['To'] = recipient - msg.set_content(body) - - self.logging.debug('We send the message') - # Send the message via our own SMTP server. - s = smtplib.SMTP(self.smtp_server) - s.starttls() - s.login(self.sender, self.password) - s.send_message(msg) - s.quit() diff --git a/example.conf.ini b/example.conf.ini deleted file mode 100644 index 33a3ac6..0000000 --- a/example.conf.ini +++ /dev/null @@ -1,10 +0,0 @@ -[Checker] -host=https://example.com -delay=1 -deep_scan=false - -[Notifier] -sender=example@example.com -password=example -smtp_server=smtp.example.com:123 -recipient=example@example.org \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4a4adc9..9cf64d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,4 @@ -build==0.7.0 +requests-html==0.10.0 +beautifulsoup4==4.11.1 flake8==4.0.1 -mccabe==0.6.1 -packaging==21.3 -pep517==0.12.0 -pycodestyle==2.8.0 -pyflakes==2.4.0 -pyparsing==3.0.8 -tomli==2.0.1 -requests +colorama==0.4.5 diff --git a/src/broken_link_checker/__init__.py b/src/broken_link_checker/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/__init__.py b/tests/__init__.py index 2864572..8693ec2 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +1,9 @@ -from .checker_test import CheckerTest \ No newline at end of file + +def test_assert(): + """ + Tests should be written here for each + function implemented in + __main__ + """ + + assert 1 == 1 diff --git a/tests/checker_test.py b/tests/checker_test.py deleted file mode 100644 index 7fb4fd9..0000000 --- a/tests/checker_test.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""Unit Test of the module checker.""" - -import unittest -from broken_link_checker.checker import Checker - - -class CheckerTest(unittest.TestCase): - """Unit Test of the module checker.""" - - def test_regex_with_html(self): - """Test for the html source.""" - checker = Checker('localhost') - - with open('tests/data.html', 'r') as f: - self.assertEqual( - len([ - i for i in checker.REGEX_TEXT_URL.findall( - f.read() - ) if any(i) - ]), - 14 - ) - - with open('tests/data2.html', 'r') as f: - self.assertEqual( - len([ - i for i in checker.REGEX_TEXT_URL.findall( - f.read() - ) if any(i) - ]), - 24 - ) - - def test_regex_with_xml(self): - """Test for the rss source.""" - checker = Checker('localhost') - - with open('tests/data.rss', 'r') as f: - self.assertEqual( - len([ - i for i in checker.REGEX_TEXT_URL.findall( - f.read() - ) if any(i) - ]), - 3 - ) - - def test_update_list(self): - """Test for the method update_list.""" 
- with open('tests/data.html', 'rb') as f1: - with open('tests/data2.html', 'rb') as f2: - with open('tests/data.rss', 'rb') as f3: - data = f1.read() + f2.read() + f3.read() - - class Response: - headers = {'Content-Type': 'text/html'} - url = 'http://localhost/' - - class raw: - def read(x): - return data - - def close(): - pass - - # without deep mode - checker = Checker(Response.url) - checker.update_list(Response) - - self.assertEqual(len(checker.url_to_check), 18) - - # with deep mode - checker = Checker('localhost', deep_scan=True) - checker.update_list(Response) - self.assertEqual(len(checker.url_to_check), 36) diff --git a/tests/checker_test.sh b/tests/checker_test.sh deleted file mode 100755 index c216f7c..0000000 --- a/tests/checker_test.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env bash -# -# Perform some verification on the broken link checker. -# NB: The purpose of this script is -# to verify the working of the checker on a real web server. - -HOST=localhost -PORT=8080 -counter=5 - -start_server() { - # We start the web server - $PYTHON tests/server.py $HOST $PORT & - # We get his pid - server_pid=$! - - # We wait the server to start - while [ $counter -gt 0 ]; do - sleep .1 - if curl -I $HOST:$PORT -s --show-error; then - break - else - echo Retry\($counter\) - counter=$(expr $counter - 1) - fi - done - - # We verify if the server is run - if [ $counter -eq 0 ]; then - return 1 - fi -} - -# We start the test -start_test() { - report=$($PYTHON -m broken_link_checker http://$HOST:$PORT -D -d 0 $BLC_FLAGS) - nb_broken_link_got=$(expr $(echo "$report" | grep -c .) - 2) - if [ ! $nb_broken_link_got -eq $NB_BROKEN_LINK_EXPECTED ]; then - echo "$NB_BROKEN_LINK_EXPECTED broken links expected, but $nb_broken_link_got got" - echo "REPORT:" - echo "$report" - return 2 - fi -} - -# We stop the server -stop_server() { - kill $server_pid -} - -if start_server; then - start_test -else - exit 1 -fi - -err_code=$? - -stop_server - -exit $err_code \ No newline at end of file diff --git a/tests/data.html b/tests/data.html deleted file mode 100644 index 06f1746..0000000 --- a/tests/data.html +++ /dev/null @@ -1,33 +0,0 @@ - - - - - - - - - - link text - Visit W3Schools.com! - Visit W3Schools! -

Absolute URLs
W3C
Google
Relative URLs
HTML Images
CSS Tutorial
- - - HTML tutorial - - - Send email - - - Visit our HTML Tutorial - HTML tutorial - HTML tutorial - HTML tutorial - - \ No newline at end of file diff --git a/tests/data.rss b/tests/data.rss deleted file mode 100644 index e541dbe..0000000 --- a/tests/data.rss +++ /dev/null @@ -1,20 +0,0 @@ - - - - - W3Schools Home Page - https://www.w3schools.com - Free web building tutorials - - RSS Tutorial - https://www.w3schools.com/xml/xml_rss.asp - New RSS tutorial on W3Schools - - - XML Tutorial - https://www.w3schools.com/xml - New XML tutorial on W3Schools - - - - \ No newline at end of file diff --git a/tests/data2.html b/tests/data2.html deleted file mode 100644 index 1fff1c2..0000000 --- a/tests/data2.html +++ /dev/null @@ -1,81 +0,0 @@ - - - - - - -Example - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/tests/index.html b/tests/index.html deleted file mode 100644 index a2db41c..0000000 --- a/tests/index.html +++ /dev/null @@ -1,52 +0,0 @@ - - - - - - Test Page - - - - - Hello, i am home - Me again - Me again - Me again - Me again - Me again - Me again - Me again - - I works! - I works too! - Me also! - Don't forgot me - - Unfortunately, I works! - Me, No! - - I am a 404 error - I am a 500 error - Wait me forever - - - - - I works - - Empty - - I am null - - html - - rss - - Another html - - I am circular - I am circular - - Infinite redirect - - \ No newline at end of file diff --git a/tests/server.py b/tests/server.py deleted file mode 100644 index 4eda982..0000000 --- a/tests/server.py +++ /dev/null @@ -1,127 +0,0 @@ -"""This server is for test purpose.""" - -from http.server import BaseHTTPRequestHandler, HTTPServer -from socketserver import ThreadingMixIn -import time -import os -import sys - -path = os.path.dirname(os.path.realpath(__file__)) - -if len(sys.argv) < 2: - print("Host and Port required") -else: - hostName = sys.argv[1] - serverPort = int(sys.argv[2]) - print(sys.argv) - - -class ThreadingSimpleServer(ThreadingMixIn, HTTPServer): - """Threading server.""" - - pass - - -class MyServer(BaseHTTPRequestHandler): - """Define the behavior of the server.""" - - def do_GET(self): - """All test request will be on this method.""" - if '?' 
in self.path: - self.path = self.path.split('?')[0] - - if self.path in ['/', '/home', '/abc/']: - self.send_response(200) - self.send_header("Content-type", "text/html") - self.end_headers() - - with open(path+'/index.html', 'rb') as f: - self.wfile.write(f.read()) - elif self.path in ['/abc']: - self.send_response(404) - self.end_headers() - elif self.path == '/error': - self.send_response(500) - self.end_headers() - elif self.path == '/wait': - time.sleep(3600) - elif self.path == '/empty': - self.send_response(200) - self.send_header("Content-type", "text/html") - self.end_headers() - elif self.path == '/null': - pass - elif self.path == '/rss': - self.send_response(200) - self.send_header("Content-type", "application/rss+xml") - self.end_headers() - - with open(path+'/data.rss', 'rb') as f: - self.wfile.write(f.read()) - elif self.path == '/html': - self.send_response(200) - self.send_header("Content-type", "text/html") - self.end_headers() - - with open(path+'/data.html', 'rb') as f: - self.wfile.write(f.read()) - elif self.path == '/html/': - self.send_response(200) - self.send_header("Content-type", "text/html") - self.end_headers() - - with open(path+'/data2.html', 'rb') as f: - self.wfile.write(f.read()) - elif self.path.replace('/', '').isdigit(): - self.send_response(int(self.path.replace('/', ''))) - self.send_header("Content-type", "text/html") - self.end_headers() - elif self.path == '/good': - self.send_response(301) - self.send_header('Location', '/201') - self.end_headers() - elif self.path == '/c/i/r/c/u/l/a/r': - self.send_response(301) - self.send_header('Location', '/') - self.end_headers() - elif self.path.startswith('/c/i/r/c/u/l/a/r/'): - self.send_response(200) - self.send_header("Content-type", "text/html") - self.end_headers() - self.wfile.write(b""" - - - - - - - """) - elif self.path == '/iredirect': - self.send_response(301) - self.send_header('Location', '/iredirect') - self.end_headers() - elif self.path == '/bad': - self.send_response(301) - self.send_header('Location', '/401') - self.end_headers() - else: - self.send_response(404) - self.end_headers() - - def do_HEAD(self): - """Permit to check the availability of the test server.""" - self.send_response(200) - self.end_headers() - - -if __name__ == "__main__": - server = ThreadingSimpleServer((hostName, serverPort), MyServer) - print("Server started http://%s:%s" % (hostName, serverPort)) - - try: - server.serve_forever() - except KeyboardInterrupt: - pass - - server.server_close() - print("Server stopped.")
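
With this change the checker is invoked as `python -m blc <url> [-m MAX_URLS]` (or `make run link=<url>` via the new Makefile), and `generate_report()` writes a `<domain>_report.json` file in the current working directory. Below is a minimal sketch of how that report could be consumed; the filename assumes a crawl of `https://example.com` was already run, and the printed fields simply mirror the keys built in `generate_report()`:

```python
import json

# Read the report written by blc's generate_report().
# The filename is derived from the crawled domain; "example.com" is
# only an assumed example here.
with open("example.com_report.json", encoding="utf-8") as f:
    report = json.load(f)

print(len(report["internal-urls"]), "internal URLs")
print(len(report["external-urls"]), "external URLs")

# Each entry in "bad-urls" is itself a JSON-encoded string of the form
# '{"reason": <status code>, "url": "<address>"}', so decode it again.
for entry in report["bad-urls"]:
    bad = json.loads(entry)
    print(bad["reason"], bad["url"])
```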
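The new `tests/__init__.py` only keeps a placeholder assertion and a note that tests still need to be written for the functions in `__main__`. A minimal sketch of what such tests could look like, exercising only the self-contained `is_valid()` helper (the test names, and running them with a runner such as pytest, are assumptions):

```python
from blc.__main__ import is_valid


def test_is_valid_accepts_absolute_urls():
    # A URL needs both a scheme and a network location to be considered valid.
    assert is_valid("https://example.com/page")
    assert is_valid("http://osscameroon.com")


def test_is_valid_rejects_relative_and_malformed_urls():
    # Relative paths and bare words have no scheme/netloc, so they are rejected.
    assert not is_valid("/relative/path")
    assert not is_valid("not a url")
```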
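The new `blc/notifier.py` is reduced to an empty `Notifier` class, while the SMTP implementation in `broken_link_checker/notifier.py` is deleted. If email reports are meant to return later, a sketch along the lines of the removed module might look like the following; the constructor arguments mirror the old code, and the server address and credentials are whatever the caller supplies:

```python
"""Notifier module."""

import smtplib
from email.message import EmailMessage


class Notifier:
    """Send the crawl report by email (sketch based on the removed module)."""

    def __init__(self, smtp_server: str, username: str, password: str):
        self.smtp_server = smtp_server
        self.sender = username
        self.password = password

    def send(self, recipient: str, subject: str, body: str) -> None:
        # Build the message container.
        msg = EmailMessage()
        msg["Subject"] = subject
        msg["From"] = self.sender
        msg["To"] = recipient
        msg.set_content(body)

        # Send it through the configured SMTP server using STARTTLS.
        with smtplib.SMTP(self.smtp_server) as server:
            server.starttls()
            server.login(self.sender, self.password)
            server.send_message(msg)
```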