feat: global refacto #60

Open · wants to merge 1 commit into base: main
6 changes: 3 additions & 3 deletions .gitignore
@@ -1,6 +1,6 @@
__pycache__
*__pycache__
*.pyc
*.ini
*.egg-info
*.json
dist
blc_venv
venv
18 changes: 4 additions & 14 deletions Makefile
@@ -1,7 +1,6 @@
.DEFAULT_GOAL=help

CONFIG_FILE=./conf.ini
VENVPATH=blc_venv
VENVPATH=venv
PYTHON=$(VENVPATH)/bin/python3
PIP=$(VENVPATH)/bin/pip

@@ -11,30 +10,21 @@ $(VENVPATH)/bin/activate: requirements.txt
. $(VENVPATH)/bin/activate; \
$(PIP) install -r requirements.txt

$(CONFIG_FILE):
echo "[-] adding config file..."
cp example.conf.ini $(CONFIG_FILE)

##install-deps: setup your dev environment
install-deps: venv $(CONFIG_FILE)
install-deps: venv

##run: run the api locally - ex: make run link="https://osscameroon.com"
run: install-deps
$(PYTHON) -m broken_link_checker $(link) --delay 1
$(PYTHON) -m blc $(link)

lint: install-deps
$(PYTHON) -m flake8 broken_link_checker --show-source --statistics
$(PYTHON) -m flake8 blc --show-source --statistics

build: install-deps
$(PYTHON) -m build

##test: test your code
test: build
$(PYTHON) -m unittest
ls dist/blc-*.whl | sort -r | grep . -m 1 > /tmp/last_package
$(PIP) install -r /tmp/last_package
PYTHON=$(PYTHON) NB_BROKEN_LINK_EXPECTED=23 sh tests/checker_test.sh
PYTHON=$(PYTHON) NB_BROKEN_LINK_EXPECTED=31 BLC_FLAGS="-n" sh tests/checker_test.sh

clean:
rm -rf $(VENVPATH) dist
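With the virtual environment renamed to `venv` and the package moved to `blc`, the day-to-day targets keep the same interface. A minimal usage sketch, assuming only the targets shown in this Makefile (the link is the example from the `run` comment):

```bash
# create the virtualenv and install the requirements
make install-deps

# check a site for broken links; `link` is forwarded to `python -m blc`
make run link="https://osscameroon.com"

# lint the blc package, then build the wheel and run the test scripts against it
make lint
make test
```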
49 changes: 1 addition & 48 deletions README-PYPI.md
@@ -12,54 +12,7 @@ This is a console project that will run as a cron job.

# Running the Project Locally

Create a virtual env:

*NB: Compatibility: Python 3.10+*

```bash
python3 -m venv blc_venv
```

Activate the virtual env:
## Linux
```bash
source blc_venv/bin/activate
```
## Windows
```cmd
blc_venv\Scripts\activate.bat
```

Install dependencies:

```bash
pip install --upgrade pip
pip install -i https://test.pypi.org/simple/ blc
```

Finally, run:

```bash
python -m broken_link_checker https://example.com --delay 1
```

To receive a report by email, you can use this command

```bash
python -m broken_link_checker https://example.com --delay 1 --sender <sender_email_address>\
--password <sender_password> --smptp_server <smtp_server:port> --recipient <recipient_email_address>
```

It is also possible to specify a config file.
NB: Refer to our default config file *broken_link_checker/conf.ini* to know how to write it.
```bash
cp example.conf.ini conf.ini
```

Apply your modifications and run the program
```bash
python -m broken_link_checker -c conf.ini
```
-- documentation needs to be written here for the Makefile --

*NB:* Some email service providers require you to enable certain settings to allow less secure apps.

40 changes: 2 additions & 38 deletions README.md
@@ -12,45 +12,9 @@ This is a console project that will run as a cron job.

# Running the Project Locally

## For normal usage
-- documentation needs to be written here for the Makefile --

[README](README-PYPI.md)

## For developer only

First, clone the repository to your local machine:
```bash
git clone https://github.com/osscameroon/broken_link_checker.git
```

Create a virtual env:

*NB: Compatibility: Python 3.10+*
```bash
python3 -m venv blc_venv
```

Activate the virtual env:
```bash
source blc_venv/bin/activate
```

Install dependencies:
```bash
make install-deps
```

Build the package
```bash
make build
```

For the next step, refer to *normal usage*.

If you want to run the tests, use this command:
```bash
make test
```
*NB:* Some email service providers require you to enable certain settings to allow less secure apps.

# License
MIT
File renamed without changes.
202 changes: 202 additions & 0 deletions blc/__main__.py
@@ -0,0 +1,202 @@
"""
main module of blc
"""
import json
from urllib.parse import urljoin, urlparse
import bs4
from requests.models import Response
from requests_html import HTMLSession

import colorama
from bs4 import BeautifulSoup

# init the colorama module
colorama.init()

GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW
CYAN = colorama.Fore.CYAN
RED = colorama.Fore.RED
WHITE = colorama.Fore.WHITE

# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()
good_urls = set()
bad_urls = set()

total_urls_visited = 0


def is_valid(url: str):
"""
Checks whether `url` is a valid URL.
"""
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)


def request_get(url: str) -> Response:
"""
The simple requests get function
"""
# initialize an HTTP session
session = HTMLSession()
# make HTTP request & retrieve response
response = session.get(url)
# execute Javascript
try:
getattr(response, 'html').render()
except Exception:
pass

return response


def get_links_from_hrefs(
soup: bs4.BeautifulSoup,
domain_name: str,
urls: set
) -> set:
"""
    Extract URLs from the href attributes in the soup object.
"""
for a_tag in soup.findAll('a'):
href = a_tag.attrs.get('href')
if href == '' or href is None:
# href empty tag
continue
# join the URL if it's relative (not absolute link)
href = urljoin(url, href)
parsed_href = urlparse(href)
# remove URL GET parameters, URL fragments, etc.
href = (
f'{parsed_href.scheme}://{parsed_href.netloc}{parsed_href.path}'
)
if 'mailto://' in href:
# For a mail link
print(f'{CYAN}[*] Mail link not checked : {href}{RESET}')
continue
if not is_valid(href):
# not a valid URL
continue
if href in internal_urls:
# already in the set
continue
if domain_name not in href:
# external link
if href not in external_urls:
print(f' {GRAY}[!] External link: {href}{RESET}')
external_urls.add(href)
continue
print(f' {WHITE}[*] Internal link: {href}{RESET}')
urls.add(href)
internal_urls.add(href)

urls = urls.union(get_all_website_links(href))

return urls


def get_all_website_links(url: str) -> set:
"""
    Return all URLs found on `url` that belong to the same website.
"""
# all URLs of `url`
urls = set()
# domain name of the URL without the protocol
domain_name = urlparse(url).netloc
try:
resp = request_get(url)
if '40' in str(resp.status_code):
bad_urls.add(json.dumps({
'reason': resp.status_code,
'url': url
}))
print(
                f'{RED}[✗] unreachable: ({resp.status_code}) <> {url}{RESET}'
)
return urls
except ConnectionError:
bad_urls.add(json.dumps({
'reason': 404,
'url': url
}))
print(
            f'{RED}[✗] unreachable: (404) <> {url}{RESET}'
)
return urls

print(f'{GREEN}[✓] <> {url}{RESET}')
good_urls.add(url)
soup = BeautifulSoup(getattr(resp, 'html').html, 'html.parser')
urls = get_links_from_hrefs(soup, domain_name, urls)

return urls


def crawl(url: str, max_urls: int = 30):
"""
Crawls a web page and extracts all links.
You'll find all links in `external_urls` and `internal_urls`
global set variables.
params:
max_urls (int): number of max urls to crawl, default is 30.
"""
global total_urls_visited
total_urls_visited += 1
links = get_all_website_links(url)
for link in links:
if total_urls_visited > max_urls:
break
crawl(link, max_urls=max_urls)


def generate_report():
"""
    Crawl the website, print a link report to stdout and
    also write it to the final JSON file.
"""
crawl(url, max_urls=max_urls)
print('[+] Internal links:', len(internal_urls))
print('[+] External links:', len(external_urls))
print('[+] Bad-links/Internal-links:', len(bad_urls))
print('[+] Good-links/Internal-links:', len(good_urls))
print('[+] Total URLs:', len(external_urls) + len(internal_urls))

domain_name = urlparse(url).netloc

report = {
'external-urls': list(external_urls),
'internal-urls': list(internal_urls),
'good-urls': list(good_urls),
'bad-urls': list(bad_urls)
}

# save the internal links to a file
with open(f'{domain_name}_report.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(report, indent=3))


if __name__ == '__main__':
import argparse

parser = argparse.ArgumentParser(
description='Link Extractor Tool with Python'
)
parser.add_argument('url', help='The URL to extract links from.')
parser.add_argument(
'-m',
'--max-urls',
help='Number of max URLs to crawl, default is 30.',
default=30,
type=int,
)

args = parser.parse_args()
url = args.url
max_urls = args.max_urls
generate_report()
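The Makefile's `run` target wraps this module, but it can also be invoked directly. A minimal sketch based on the argparse options above; the URL is only an example:

```bash
# crawl up to 50 pages starting from the given URL
python -m blc https://example.com --max-urls 50

# the crawler writes its findings to <domain>_report.json in the working directory
cat example.com_report.json
```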
5 changes: 5 additions & 0 deletions blc/notifier.py
@@ -0,0 +1,5 @@
"""Notifier module."""


class Notifier:
""" Notifier """