feat: global refacto #60

Open · wants to merge 1 commit into base: main
6 changes: 3 additions & 3 deletions .gitignore
@@ -1,6 +1,6 @@
__pycache__
*__pycache__
*.pyc
*.ini
*.egg-info
*.json
dist
blc_venv
venv
18 changes: 4 additions & 14 deletions Makefile
@@ -1,7 +1,6 @@
.DEFAULT_GOAL=help

CONFIG_FILE=./conf.ini
VENVPATH=blc_venv
VENVPATH=venv
PYTHON=$(VENVPATH)/bin/python3
PIP=$(VENVPATH)/bin/pip

@@ -11,30 +10,21 @@ $(VENVPATH)/bin/activate: requirements.txt
. $(VENVPATH)/bin/activate; \
$(PIP) install -r requirements.txt

$(CONFIG_FILE):
echo "[-] adding config file..."
cp example.conf.ini $(CONFIG_FILE)

##install-deps: setup your dev environment
install-deps: venv $(CONFIG_FILE)
install-deps: venv

##run: run the api locally - ex: make run link="https://osscameroon.com"
run: install-deps
$(PYTHON) -m broken_link_checker $(link) --delay 1
$(PYTHON) -m blc $(link)

lint: install-deps
$(PYTHON) -m flake8 broken_link_checker --show-source --statistics
$(PYTHON) -m flake8 blc --show-source --statistics

build: install-deps
$(PYTHON) -m build

##test: test your code
test: build
$(PYTHON) -m unittest
ls dist/blc-*.whl | sort -r | grep . -m 1 > /tmp/last_package
$(PIP) install -r /tmp/last_package
PYTHON=$(PYTHON) NB_BROKEN_LINK_EXPECTED=23 sh tests/checker_test.sh
PYTHON=$(PYTHON) NB_BROKEN_LINK_EXPECTED=31 BLC_FLAGS="-n" sh tests/checker_test.sh

clean:
rm -rf $(VENVPATH) dist
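With the virtual environment renamed to `venv` and the package moved to `blc`, the day-to-day targets keep the same interface. A minimal usage sketch, assuming only the targets shown in this Makefile (the link is the example from the `run` comment):

```bash
# create the virtualenv and install the requirements
make install-deps

# check a site for broken links; `link` is forwarded to `python -m blc`
make run link="https://osscameroon.com"

# lint the blc package, then build the wheel and run the test scripts against it
make lint
make test
```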
49 changes: 1 addition & 48 deletions README-PYPI.md
@@ -12,54 +12,7 @@ This is a console project that will run as a cron job.

# Running the Project Locally

Create a virtual env:

*NB: Compatibility: Python 3.10+*

```bash
python3 -m venv blc_venv
```

Activate the virtual env:
## Linux
```bash
source blc_venv/bin/activate
```
## Windows
```cmd
blc_venv\Scripts\activate.bat
```

Install dependencies:

```bash
pip install --upgrade pip
pip install -i https://test.pypi.org/simple/ blc
```

Finally, run:

```bash
python -m broken_link_checker https://example.com --delay 1
```

To receive a report by email, you can use this command

```bash
python -m broken_link_checker https://example.com --delay 1 --sender <sender_email_address>\
--password <sender_password> --smptp_server <smtp_server:port> --recipient <recipient_email_address>
```

It is also possible to specify a config file.
NB: Refer to our default config file *broken_link_checker/conf.ini* to know how to write it.
```bash
cp example.conf.ini conf.ini
```

Apply your modifications and run the program
```bash
python -m broken_link_checker -c conf.ini
```
-- documentation needs to be written here for the Makefile --

*NB:* Some email service providers require you to enable certain settings to allow less secure apps.

40 changes: 2 additions & 38 deletions README.md
@@ -12,45 +12,9 @@ This is a console project that will run as a cron job.

# Running the Project Locally

## For normal usage
-- documentation needs to be written here for the Makefile --

[README](README-PYPI.md)

## For developer only

First, clone the repository to your local machine:
```bash
git clone https://github.com/osscameroon/broken_link_checker.git
```

Create a virtual env:

*NB: Compatibility: Python 3.10+*
```bash
python3 -m venv blc_venv
```

Activate the virtual env:
```bash
source blc_venv/bin/activate
```

Install dependencies:
```bash
make install-deps
```

Build the package
```bash
make build
```

For the next step, refer to *normal usage*.

If you want to run the tests, use this command:
```bash
make test
```
*NB:* Some email service providers require you to enable certain settings to allow less secure apps.

# License
MIT
File renamed without changes.
202 changes: 202 additions & 0 deletions blc/__main__.py
@@ -0,0 +1,202 @@
"""
main module of blc
"""
import json
from urllib.parse import urljoin, urlparse
import bs4
from requests.models import Response
from requests_html import HTMLSession

import colorama
from bs4 import BeautifulSoup

# init the colorama module
colorama.init()

GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW
CYAN = colorama.Fore.CYAN
RED = colorama.Fore.RED
WHITE = colorama.Fore.WHITE

# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()
good_urls = set()
bad_urls = set()

total_urls_visited = 0


def is_valid(url: str):
"""
Checks whether `url` is a valid URL.
"""
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)


def request_get(url: str) -> Response:
"""
The simple requests get function
"""
# initialize an HTTP session
session = HTMLSession()
# make HTTP request & retrieve response
response = session.get(url)
# execute Javascript
try:
getattr(response, 'html').render()
except Exception:
pass

return response


def get_links_from_hrefs(
soup: bs4.BeautifulSoup,
domain_name: str,
urls: set
) -> set:
"""
    Extract URLs from the href attributes in the soup object.
"""
for a_tag in soup.findAll('a'):
href = a_tag.attrs.get('href')
if href == '' or href is None:
# href empty tag
continue
# join the URL if it's relative (not absolute link)
href = urljoin(url, href)
parsed_href = urlparse(href)
# remove URL GET parameters, URL fragments, etc.
href = (
f'{parsed_href.scheme}://{parsed_href.netloc}{parsed_href.path}'
)
if 'mailto://' in href:
# For a mail link
print(f'{CYAN}[*] Mail link not checked : {href}{RESET}')
continue
if not is_valid(href):
# not a valid URL
continue
if href in internal_urls:
# already in the set
continue
if domain_name not in href:
# external link
if href not in external_urls:
print(f' {GRAY}[!] External link: {href}{RESET}')
external_urls.add(href)
continue
print(f' {WHITE}[*] Internal link: {href}{RESET}')
urls.add(href)
internal_urls.add(href)

urls = urls.union(get_all_website_links(href))

return urls


def get_all_website_links(url: str) -> set:
"""
    Return all URLs found on `url` that belong to the same website.
"""
# all URLs of `url`
urls = set()
# domain name of the URL without the protocol
domain_name = urlparse(url).netloc
try:
resp = request_get(url)
if '40' in str(resp.status_code):
bad_urls.add(json.dumps({
'reason': resp.status_code,
'url': url
}))
print(
                f'{RED}[✗] unreachable: ({resp.status_code}) <> {url}{RESET}'
)
return urls
except ConnectionError:
bad_urls.add(json.dumps({
'reason': 404,
'url': url
}))
print(
            f'{RED}[✗] unreachable: (404) <> {url}{RESET}'
)
return urls

print(f'{GREEN}[✓] <> {url}{RESET}')
good_urls.add(url)
soup = BeautifulSoup(getattr(resp, 'html').html, 'html.parser')
urls = get_links_from_hrefs(soup, domain_name, urls)

return urls


def crawl(url: str, max_urls: int = 30):
"""
Crawls a web page and extracts all links.
You'll find all links in `external_urls` and `internal_urls`
global set variables.
params:
max_urls (int): number of max urls to crawl, default is 30.
"""
global total_urls_visited
total_urls_visited += 1
links = get_all_website_links(url)
for link in links:
if total_urls_visited > max_urls:
break
crawl(link, max_urls=max_urls)


def generate_report():
"""
    Crawl the website, print a link report to stdout and
    also write it to the final JSON file.
"""
crawl(url, max_urls=max_urls)
print('[+] Internal links:', len(internal_urls))
print('[+] External links:', len(external_urls))
print('[+] Bad-links/Internal-links:', len(bad_urls))
print('[+] Good-links/Internal-links:', len(good_urls))
print('[+] Total URLs:', len(external_urls) + len(internal_urls))

domain_name = urlparse(url).netloc

report = {
'external-urls': list(external_urls),
'internal-urls': list(internal_urls),
'good-urls': list(good_urls),
'bad-urls': list(bad_urls)
}

# save the internal links to a file
with open(f'{domain_name}_report.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(report, indent=3))


if __name__ == '__main__':
import argparse

parser = argparse.ArgumentParser(
description='Link Extractor Tool with Python'
)
parser.add_argument('url', help='The URL to extract links from.')
parser.add_argument(
'-m',
'--max-urls',
help='Number of max URLs to crawl, default is 30.',
default=30,
type=int,
)

args = parser.parse_args()
url = args.url
max_urls = args.max_urls
generate_report()
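The Makefile's `run` target wraps this module, but it can also be invoked directly. A minimal sketch based on the argparse options above; the URL is only an example:

```bash
# crawl up to 50 pages starting from the given URL
python -m blc https://example.com --max-urls 50

# the crawler writes its findings to <domain>_report.json in the working directory
cat example.com_report.json
```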
5 changes: 5 additions & 0 deletions blc/notifier.py
@@ -0,0 +1,5 @@
"""Notifier module."""


class Notifier:
""" Notifier """