diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..dfe0770 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d9005f2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,152 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintainted in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..bcae6df --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Ivan Šincek + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..0b072f7 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include src/chad/*.py +include src/chad_extractor/*.py +include src/dorks/*.txt +include src/templates/*.json diff --git a/README.md b/README.md new file mode 100644 index 0000000..f116342 --- /dev/null +++ b/README.md @@ -0,0 +1,436 @@ +# Chad + +Search Google Dorks like Chad. Based on [ivan-sincek/nagooglesearch](https://github.com/ivan-sincek/nagooglesearch). + +Tested on Kali Linux v2024.2 (64-bit). + +Made for educational purposes. I hope it will help! + +Future plans: + +* Chad Extractor: + * check if Playwright's Chromium headless browser is installed or not, + * add option to stop on rate limiting, + * find a way to bypass the auth. wall for `linkedin-user`. + +## Table of Contents + +* [How to Install](#how-to-install) + * [Install Playwright and Chromium](#install-playwright-and-chromium) + * [Standard Install](#standard-install) + * [Build and Install From the Source](#build-and-install-from-the-source) +* [Shortest Possible](#shortest-possible) +* [File Download](#file-download) +* [Chad Extractor](#chad-extractor) +* [Broken Link Hijacking](#broken-link-hijacking) + * [Single Site](#single-site) + * [Multiple Sites](#multiple-sites) + * [Analyzing the Report](#analyzing-the-report) + * [Rate Limiting](#rate-limiting) +* [Usage](#usage) +* [Images](#images) + +## How to Install + +### Install Playwright and Chromium + +```bash +pip3 install --upgrade playwright + +playwright install chromium +``` + +Make sure each time you upgrade your Playwright dependency to re-install Chromium; otherwise, you might get an error using the headless browser in Chad Extractor. + +### Standard Install + +```bash +pip3 install --upgrade google-chad +``` + +### Build and Install From the Source + +```bash +git clone https://github.com/ivan-sincek/chad && cd chad + +python3 -m pip install --upgrade build + +python3 -m build + +python3 -m pip install dist/google_chad-7.0-py3-none-any.whl +``` + +## Shortest Possible + +```bash +chad -q 'intitle:"index of /" intext:"parent directory"' +``` + +## File Download + +Did you say Metagoofil?! 
+
+```bash
+mkdir downloads
+
+chad -q "ext:pdf OR ext:docx OR ext:xlsx OR ext:pptx" -s *.example.com -tr 200 -dir downloads
+```
+
+Chad's file download feature is based on the Python Requests library.
+
+## Chad Extractor
+
+Chad Extractor is a powerful tool based on [Scrapy's](https://scrapy.org) web crawler and [Playwright's](https://playwright.dev/python) Chromium headless browser, designed to scrape web content efficiently. Unlike the Python Requests library, it can render JavaScript-generated HTML and is not as easily blocked by anti-bot solutions.
+
+Primarily, Chad Extractor is designed to extract and validate data from Chad results files. However, it can also extract and validate data from plaintext files by using the `-pt` option.
+
+If the `-pt` option is used, plaintext files are treated like server responses: the extraction logic is applied first, followed by validation. This is also useful if you want to re-test previous Chad Extractor reports, e.g., by running `-res report.json -pt -o retest.json`.
+
+## Broken Link Hijacking
+
+Prepare the Google Dorks as a [social_media_dorks.txt](https://github.com/ivan-sincek/chad/blob/main/src/dorks/social_media_dorks.txt) file:
+
+```fundamental
+intext:"t.me/"
+intext:"discord.com/invite/" OR intext:"discord.gg/invite/"
+intext:"youtube.com/c/" OR intext:"youtube.com/channel/"
+intext:"twitter.com/" OR intext:"x.com/"
+intext:"facebook.com/"
+intext:"instagram.com/"
+intext:"tiktok.com/"
+intext:"linkedin.com/in/" OR intext:"linkedin.com/company/"
+```
+
+Prepare the template as a [social_media_template.json](https://github.com/ivan-sincek/chad/blob/main/src/templates/social_media_template.json) file (excerpt shown below):
+
+```json
+{
+    "telegram":{
+        "extract":"t\\.me\\/(?:(?!(?:share)(?:(?:\\/|\\?|\\\\|\"|\\<)*$|(?:\\/|\\?|\\\\|\\\"|\\<)[\\s\\S]))[\\w\\d\\.\\_\\-\\+\\@]+)"
+    },
+    "discord":{
+        "extract":"discord\\.(?:com|gg)\\/invite\\/[\\w\\d\\.\\_\\-\\+\\@]+",
+        "validate_cookies":{
+            "SOCS":"CAESEwgDEgk2OTk3ODk2MzcaAmVuIAEaBgiAn5S6Bg"
+        }
+    }
+}
+```
+
+Table 1 - Template Attributes
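+
+To make the template-driven flow above concrete, here is a minimal Python sketch of the two stages Chad Extractor performs: extracting candidate URLs from crawled content, then validating each candidate. It is an illustration under assumed semantics rather than the tool's actual implementation - the `validate` value, the example URL, and the helper names are made up, and the real tool is built on Scrapy, Playwright, and the third-party `regex` package rather than plain `re`.
+
+```python
+import json
+import re
+
+# Illustrative template - simplified and NOT the shipped social_media_template.json.
+# Assumed semantics: "extract" captures candidate URLs from crawled pages, while
+# "validate" is matched against each candidate URL's own response to decide
+# whether the linked account still exists.
+TEMPLATE = json.loads(r"""
+{
+    "telegram": {
+        "extract": "t\\.me/[\\w.+@_-]+",
+        "validate": "<title>Telegram: Contact @"
+    }
+}
+""")
+
+def extract(page_content: str) -> list[tuple[str, str]]:
+    """Stage 1: collect candidate URLs from crawled page content."""
+    found = []
+    for key, entry in TEMPLATE.items():
+        for match in re.finditer(entry["extract"], page_content, re.IGNORECASE):
+            found.append((key, f"https://{match.group(0)}"))
+    return found
+
+def is_takeover_candidate(key: str, candidate_response: str) -> bool:
+    """Stage 2: if the validation pattern does not match the candidate's own
+    response, the account likely no longer exists (broken link)."""
+    return re.search(TEMPLATE[key]["validate"], candidate_response, re.IGNORECASE) is None
+
+if __name__ == "__main__":
+    page = 'Join us on <a href="https://t.me/does_not_exist">Telegram</a>!'
+    for key, url in extract(page):
+        # In the real tool the URL would be fetched; a generic body here simulates
+        # a deleted account, so the check flags the URL for manual review.
+        print(url, "-> takeover candidate?", is_takeover_candidate(key, "<title>Telegram</title>"))
+```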

+ +### Single Site + +```bash +chad -q social_media_dorks.txt -s *.example.com -tr 200 -pr 100 -o results.json + +chad-extractor -t social_media_template.json -res results.json -o report.json +``` + +### Multiple Sites + +Prepare the domains / subdomains as `sites.txt` file, the same way you would use them with the `site:` option in Google: + +```fundamental +*.example.com +*.example.com -www +``` + +Run: + +```bash +mkdir chad_results + +IFS=$'\n'; count=0; for site in $(cat sites.txt); do count=$((count+1)); echo "#${count} | ${site}"; chad -q social_media_dorks.txt -s "${site}" -tr 200 -pr 100 -o "chad_results/results_${count}.json"; done + +chad-extractor -t social_media_template.json -res chad_results -o report.json -v +``` + +### Analyzing the Report + +Manually verify if the broken social media URLs in `results[summary][validated]` are vulnerable to takeover: + +```json +{ + "started_at":"2023-12-23 03:30:10", + "ended_at":"2023-12-23 04:20:00", + "summary":{ + "validated":[ + "https://t.me/does_not_exist" // might be vulnerable to takeover + ], + "extracted":[ + "https://discord.com/invite/exists", + "https://t.me/does_not_exist", + "https://t.me/exists" + ] + }, + "failed":{ + "validation":[], + "extraction":[] + }, + "full":[ + { + "url":"https://example.com/about", + "results":{ + "telegram":[ + "https://t.me/does_not_exist", + "https://t.me/exists" + ], + "discord":[ + "https://discord.com/invite/exists" + ] + } + } + ] +} +``` + +### Rate Limiting + +Google's cooling-off period can range from a few hours to a whole day. + +To avoid hitting Google's rate limits with Chad, increase the minimum and maximum sleep between Google queries and/or pages; or use free or paid proxies. However, free proxies are often blocked and unstable. + +To download a list of free proxies, run: + +```bash +curl -s 'https://proxylist.geonode.com/api/proxy-list?limit=50&page=1&sort_by=lastChecked&sort_type=desc' -H 'Referer: https://proxylist.geonode.com/' | jq -r '.data[] | "\(.protocols[])://\(.ip):\(.port)"' > proxies.txt +``` + +**If you are using proxies, you might want to increase the request timeout, as responses will need longer time to arrive.** + +Additionally, to avoid hitting rate limits on platforms like [Instagram's](https://www.instagram.com) while using Chad Extractor, consider decreasing the number of concurrent requests per domain and increasing the sleep and wait times. + +## Usage + +```fundamental +Chad v7.0 ( github.com/ivan-sincek/chad ) + +Usage: chad -q queries [-s site ] [-x proxies ] [-o out ] +Example: chad -q queries.txt [-s *.example.com] [-x proxies.txt] [-o results.json] + +DESCRIPTION + Search Google Dorks like Chad +QUERIES + File containing Google Dorks or a single query to use + -q, --queries = queries.txt | intext:password | "ext:tar OR ext:zip" | etc. +SITE + Domain[s] to search + -s, --site = example.com | sub.example.com | *.example.com | "*.example.com -www" | etc. +TIME + Get results not older than the specified time in months + -t, --time = 6 | 12 | 24 | etc. +TOTAL RESULTS + Total number of unique results + Default: 100 + -tr, --total-results = 200 | etc. +PAGE RESULTS + Number of results per page - capped at 100 by Google + Default: randint(70, 100) + -pr, --page-results = 50 | etc. +MINIMUM QUERIES + Minimum sleep time in seconds between Google queries + Default: 75 + -min-q, --minimum-queries = 120 | etc. +MAXIMUM QUERIES + Maximum sleep time between Google queries + Default: minimum + 50 + -max-q, --maximum-queries = 180 | etc. 
+MINIMUM PAGES + Minimum sleep time between Google pages + Default: 15 + -min-p, --minimum-pages = 30 | etc. +MAXIMUM PAGES + Maximum sleep time between Google pages + Default: minimum + 10 + -max-p, --maximum-pages = 60 | etc. +USER AGENTS + User agents to use + Default: random-all + -a, --user-agents = user_agents.txt | random(-all) | curl/3.30.1 | etc. +PROXIES + File containing web proxies or a single web proxy to use + -x, --proxies = proxies.txt | http://127.0.0.1:8080 | etc. +DIRECTORY + Downloads directory + All downloaded files will be saved in this directory + -dir, --directory = downloads | etc. +THREADS + Number of files to download in parallel + Default: 5 + -th, --threads = 20 | etc. +OUT + Output file + -o, --out = results.json | etc. +NO SLEEP ON START + Disable the safety feature to prevent triggering rate limits by accident + -nsos, --no-sleep-on-start +DEBUG + Enable debug output + -dbg, --debug +``` + +```fundamental +Chad Extractor v7.0 ( github.com/ivan-sincek/chad ) + +Usage: chad-extractor -t template -res results -o out [-s sleep] [-rs random-sleep] +Example: chad-extractor -t template.json -res chad_results -o report.json [-s 1.5 ] [-rs ] + +DESCRIPTION + Extract and validate data from Chad results or plaintext files +TEMPLATE + File containing extraction and validation details + -t, --template = template.json | etc. +RESULTS + Directory containing Chad results or plaintext files, or a single file + If a directory is specified, files ending with '.report.json' will be ignored + -res, --results = chad_results | results.json | urls.txt | etc. +PLAINTEXT + Treat all the results as plaintext files / server responses + -pt, --plaintext +EXCLUDES + File containing regular expressions or a single regular expression to exclude content from the page + Applies only for extraction + -e, --excludes = regexes.txt | "
.+?<\/div>" | etc. +PLAYWRIGHT + Use Playwright's headless browser + Applies only for extraction + -p, --playwright +PLAYWRIGHT WAIT + Wait time in seconds before fetching the page content + Applies only for extraction + -pw, --playwright-wait = 0.5 | 2 | 4 | etc. +CONCURRENT REQUESTS + Number of concurrent requests + Default: 15 + -cr, --concurrent-requests = 30 | 45 | etc. +CONCURRENT REQUESTS PER DOMAIN + Number of concurrent requests per domain + Default: 5 + -crd, --concurrent-requests-domain = 10 | 15 | etc. +SLEEP + Sleep time in seconds between two consecutive requests to the same domain + -s, --sleep = 1.5 | 3 | etc. +RANDOM SLEEP + Randomize the sleep time between requests to vary between '0.5 * sleep' and '1.5 * sleep' + -rs, --random-sleep +AUTO THROTTLE + Auto throttle concurrent requests based on the load and latency + Sleep time is still respected + -at, --auto-throttle = 0.5 | 10 | 15 | 45 | etc. +RETRIES + Number of retries per URL + Default: 2 + -r, --retries = 0 | 4 | etc. +REQUEST TIMEOUT + Request timeout in seconds + Default: 60 + -rt, --request-timeout = 30 | 90 | etc. +USER AGENTS + User agents to use + Default: random-all + -a, --user-agents = user_agents.txt | random(-all) | curl/3.30.1 | etc. +PROXY + Web proxy to use + -x, --proxy = http://127.0.0.1:8080 | etc. +OUT + Output file + -o, --out = report.json | etc. +VERBOSE + Create additional supporting output files that end with '.report.json' + -v, --verbose +DEBUG + Enable debug output + -dbg, --debug +``` + +## Images + +

+Figure 1 - (Chad) File Download - Single Google Dork
+
+Figure 2 - (Chad) Broken Link Hijacking - Multiple Google Dorks
+
+Figure 3 - (Chad Extractor) Extraction
+
+Figure 4 - (Chad Extractor) Validation
diff --git a/img/extraction.png b/img/extraction.png new file mode 100644 index 0000000..5b9fc5c Binary files /dev/null and b/img/extraction.png differ diff --git a/img/multiple_google_dorks.png b/img/multiple_google_dorks.png new file mode 100644 index 0000000..f2ffea8 Binary files /dev/null and b/img/multiple_google_dorks.png differ diff --git a/img/single_google_dork.png b/img/single_google_dork.png new file mode 100644 index 0000000..39761c2 Binary files /dev/null and b/img/single_google_dork.png differ diff --git a/img/validation.png b/img/validation.png new file mode 100644 index 0000000..6896088 Binary files /dev/null and b/img/validation.png differ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b7bbb8e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,41 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "google-chad" +version = "7.0" +authors = [{ name = "Ivan Sincek" }] +description = "Not another Google Dorking tool." +readme = "README.md" +requires-python = ">=3.10" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent" +] +dependencies = [ + "alive-progress>=3.1.5", + "asyncio>=3.4.3", + "colorama>=0.4.6", + "nagooglesearch>=7.3", + "playwright>=1.47.0", + "regex>=2023.8.8", + "requests>=2.31.0", + "scrapy>=2.11.0", + "scrapy-playwright>=0.0.32", + "termcolor>=2.4.0" +] + +[project.urls] +"Homepage" = "https://github.com/ivan-sincek/chad" + +[project.scripts] +chad = "chad.main:main" +chad-extractor = "chad_extractor.main:main" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +"*" = ["dorks/*.txt", "templates/*.json"] diff --git a/src/chad/main.py b/src/chad/main.py new file mode 100644 index 0000000..dc2ab25 --- /dev/null +++ b/src/chad/main.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 + +from .utils import chad, config, validate + +import datetime + +# ---------------------------------------- + +class Stopwatch: + + def __init__(self): + self.__start = datetime.datetime.now() + + def stop(self): + self.__end = datetime.datetime.now() + print(f"Script has finished in {self.__end - self.__start}") + +stopwatch = Stopwatch() + +# ---------------------------------------- + +def main(): + success, args = validate.Validate().validate_args() + if success: + config.banner() + tool = chad.Chad( + args.queries, + args.site, + args.time, + args.total_results, + args.page_results, + args.minimum_queries, + args.maximum_queries, + args.minimum_pages, + args.maximum_pages, + args.user_agents, + args.proxies, + not args.no_sleep_on_start, + args.debug + ) + if tool.prepare() and tool.run(): + if args.directory: + tool.download_files(args.threads, args.directory) + if args.out: + tool.save(args.out) + stopwatch.stop() + +if __name__ == "__main__": + main() diff --git a/src/chad/utils/array.py b/src/chad/utils/array.py new file mode 100644 index 0000000..6e472bc --- /dev/null +++ b/src/chad/utils/array.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 + +def unique(array: list): + """ + Remove duplicates from a list. + """ + seen = set() + return [x for x in array if not (x in seen or seen.add(x))] diff --git a/src/chad/utils/chad.py b/src/chad/utils/chad.py new file mode 100644 index 0000000..6f089b2 --- /dev/null +++ b/src/chad/utils/chad.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python3 + +from . 
import array, file, general, grep, proxy + +import alive_progress, concurrent.futures, dataclasses, datetime, dateutil.relativedelta, nagooglesearch, random, requests, threading, time + +requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) + +# ---------------------------------------- + +@dataclasses.dataclass +class Google: + """ + Class for storing Google search details. + """ + query: str + proxy: str = "" + urls : list[str] = dataclasses.field(default_factory = list) + +# ---------------------------------------- + +class Chad: + + def __init__( + self, + queries : list[str], + site : str, + time : int, + total_results : int, + page_results : int, + minimum_queries: int, + maximum_queries: int, + minimum_pages : int, + maximum_pages : int, + user_agents : list[str], + proxies : list[str], + sleep_on_start : bool, + debug : bool + ): + """ + Class for Google searching. + """ + self.__queries = queries + self.__site = site + self.__tbs = self.__get_tbs(time) + self.__total_results = total_results + self.__page_results = page_results + self.__minimum_queries = minimum_queries + self.__maximum_queries = maximum_queries + self.__minimum_pages = minimum_pages + self.__maximum_pages = maximum_pages + self.__user_agents = user_agents + self.__user_agents_len = len(self.__user_agents) + self.__proxies = proxy.Proxies(proxies) + self.__sleep_on_start = sleep_on_start + self.__debug = debug + self.__debug_lock = threading.Lock() + self.__blacklist = grep.get_blacklist() + self.__results = [] + + def __get_tbs(self, time: int) -> str: + """ + Get a value for the 'to be searched' Google query parameter. + """ + tmp = "li:1" + if time: + now = datetime.datetime.today() + tmp = nagooglesearch.get_tbs(now, now - dateutil.relativedelta.relativedelta(months = time)) + return tmp + + def prepare(self): + """ + Validate Google Dorks, and, if applicable, prepend the specified site to each one. + """ + print(general.get_timestamp("Validating Google Dorks...")) + print("Google only allows Google Dorks up to 32 words in length, separated by spaces") + print("If the site is specified, Google Dorks containing the 'site:' operator will be ignored") + tmp = [] + ignored = [] + for query in self.__queries: + if self.__site: + if grep.has_site(query): + ignored.append(query) + continue + query = f"site:{self.__site} {query}" + if len(query.split(" ")) > 32: + ignored.append(query) + continue + tmp.append(query) + if ignored: + print(f"IGNORED GOOGLE DORKS: {len(ignored)}") + for query in ignored: + general.print_cyan(query) + if not tmp: + general.print_red("No valid Google Dorks were found!") + self.__queries = array.unique(tmp) + return self.__queries + + def run(self): + """ + Run a Google search. 
+ """ + print(general.get_timestamp("Searching Google Dorks...")) + print("Press CTRL + C to exit early - results will be saved") + self.__results = [] + count = 0 + exit_program = False + try: + if self.__sleep_on_start: + self.__wait() + for query in self.__queries: + count += 1 + result = Google(query) + while not exit_program: + # -------------------- + if not self.__proxies.is_empty(): + if self.__proxies.is_round_robin(): + self.__wait() + result.proxy = self.__proxies.get() + elif count > 1: + self.__wait() + # -------------------- + self.__print_status(count, result) + search_parameters = { + "q" : result.query, + "tbs" : self.__tbs, + "hl" : "en", + "filter": "0", + "safe" : "images", + "num" : self.__get_num() + } + client = nagooglesearch.GoogleClient( + tld = "com", + search_parameters = search_parameters, + user_agent = self.__get_user_agent(), + proxy = result.proxy, + max_results = self.__total_results, + min_sleep = self.__minimum_pages, + max_sleep = self.__maximum_pages, + debug = self.__debug + ) + result.urls = client.search() + # -------------------- + if not grep.has_site(result.query): + result.urls = grep.filter_blacklist(result.urls, self.__blacklist) + if result.urls: + self.__results.append(result) + # -------------------- + error = client.get_error() + if error in ["REQUESTS_EXCEPTION", "429_TOO_MANY_REQUESTS"]: + general.print_yellow(error) + if result.proxy: + message = self.__proxies.remove(result.proxy) + if message: + print(message) + if self.__proxies.is_empty(): + general.print_red("All proxies has been exhausted!") + exit_program = True + break + else: + exit_program = True + break + else: + break + # -------------------- + if exit_program: + break + except KeyboardInterrupt: + pass + if not self.__results: + print("No results") + else: + print(general.jdump(self.__to_dict())) + return self.__results + + def __wait(self): + """ + Sleep for a random amount of time in seconds. + """ + seconds = random.randint(self.__minimum_queries, self.__maximum_queries) + print(f"Sleeping between Google Dorks for {seconds} sec...") + time.sleep(seconds) + + def __print_status(self, id: int, data: Google): + """ + Print the current status. + """ + text = f"QUERY {id}/{len(self.__queries)}: {data.query}" + if data.proxy: + text = f"{text} | PROXY: {data.proxy}" + general.print_green(general.get_timestamp(text)) + + def __get_num(self): + """ + Get the number of results per page as a string.\n + If not specified, return a random number between 70 and 100 as a string. + """ + return str(self.__page_results if self.__page_results > 0 else random.randint(70, 100)) + + def __get_headers(self): + """ + Get HTTP request headers. + """ + return { + "User-Agent": self.__get_user_agent(), + "Accept-Language": "en-US, *", + "Accept": "*/*", + "Referer": "https://www.google.com/", + "Upgrade-Insecure-Requests": "1" + } + + def __get_user_agent(self): + """ + Get a [random] user agent.\n + Returns an empty string if there are no user agents. + """ + user_agent = "" + if self.__user_agents_len > 0: + user_agent = self.__user_agents[random.randint(0, self.__user_agents_len - 1)] + return user_agent + + def __get_urls(self) -> list[str]: + """ + Combine all Google Dork result URLs into a single list. 
+ """ + tmp = [] + for result in self.__results: + tmp.extend(result.urls) + tmp = array.unique(tmp) + random.shuffle(tmp) + return tmp + + def download_files(self, threads: int, directory: str): + """ + Download the content from all Google Dork result URLs into files.\n + Proxies are ignored. + """ + print(general.get_timestamp("Downloading files... Proxies are ignored")) + with alive_progress.alive_bar(len(self.__results), title = "Progress:") as bar: + with concurrent.futures.ThreadPoolExecutor(max_workers = threads) as executor: + subprocesses = [] + for url in self.__get_urls(): + subprocesses.append(executor.submit(self.__get, url, directory)) + for subprocess in concurrent.futures.as_completed(subprocesses): + result: file.File = subprocess.result() + if result.content and result.path: + file.write_binary_silent(result.content, result.path) + bar() + + def __get(self, url: str, downloads_directory: str): + """ + Get the content from a URL. + """ + tmp = file.File() + session = None + response = None + try: + session = requests.Session() + session.max_redirects = 10 + response = session.get(url, headers = self.__get_headers(), proxies = None, verify = False, allow_redirects = True, timeout = 30) + if response.status_code == 200: + tmp.content = response.content + tmp.path = file.get_url_filename(url, downloads_directory) + except (requests.exceptions.RequestException, requests.packages.urllib3.exceptions.HTTPError) as ex: + self.__print_debug(str(ex)) + finally: + if response: + response.close() + if session: + session.close() + return tmp + + def __print_debug(self, message: str): + """ + Print a debug message. + """ + if self.__debug: + with self.__debug_lock: + general.print_yellow(message) + + def save(self, out: str): + """ + Save the results in an output file.\n + If the output file exists, prompt to overwrite it. + """ + if self.__results: + file.overwrite(general.jdump(self.__to_dict()), out) + + def __to_dict(self): + """ + Convert an instance of a class into a dictionary. + """ + return [dataclasses.asdict(result) for result in self.__results] diff --git a/src/chad/utils/config.py b/src/chad/utils/config.py new file mode 100644 index 0000000..eb376d5 --- /dev/null +++ b/src/chad/utils/config.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 + +APP_VERSION = "v7.0" + +def banner(): + """ + Display the banner. + """ + print("#########################################################################") + print("# #") + print("# Chad v7.0 #") + print("# #") + print("# Search Google Dorks like Chad. #") + print("# GitHub repository at github.com/ivan-sincek/chad. #") + print("# #") + print("#########################################################################") diff --git a/src/chad/utils/directory.py b/src/chad/utils/directory.py new file mode 100644 index 0000000..b5faa36 --- /dev/null +++ b/src/chad/utils/directory.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 + +import os + +def is_directory(directory: str): + """ + Returns 'True' if 'directory' exists and is a regular directory. + """ + return os.path.isdir(directory) diff --git a/src/chad/utils/file.py b/src/chad/utils/file.py new file mode 100644 index 0000000..ccddc11 --- /dev/null +++ b/src/chad/utils/file.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +from . import array + +import dataclasses, os, urllib.parse + +__ENCODING = "ISO-8859-1" + +@dataclasses.dataclass +class File: + """ + Class for storing file details. 
+ """ + content: bytes = b"" + path : str = "" + +def is_file(file: str): + """ + Returns 'True' if the 'file' exists and is a regular file. + """ + return os.path.isfile(file) + +def validate(file: str): + """ + Validate a file.\n + Success flag is 'True' if the file has a read permission and is not empty. + """ + success = False + message = "" + if not os.access(file, os.R_OK): + message = f"\"{file}\" does not have a read permission" + elif not os.stat(file).st_size > 0: + message = f"\"{file}\" is empty" + else: + success = True + return success, message + +def read_array(file: str) -> list[str]: + """ + Read a file line by line, and append the lines to a list.\n + Whitespace will be stripped from each line, and empty lines will be removed.\n + Returns a unique list. + """ + tmp = [] + with open(file, "r", encoding = __ENCODING) as stream: + for line in stream: + line = line.strip() + if line: + tmp.append(line) + return array.unique(tmp) + +def write_binary_silent(content: bytes, out: str): + """ + Silently write a binary content to an output file. + """ + try: + open(out, "wb").write(content) + except Exception: + pass + +def overwrite(text: str, out: str): + """ + Write a text to an output file.\n + If the output file exists, prompt to overwrite it. + """ + confirm = "yes" + if os.path.isfile(out): + print(f"'{out}' already exists") + confirm = input("Overwrite the output file (yes): ") + if confirm.lower() in ["yes", "y"]: + try: + open(out, "w").write(text) + print(f"Results have been saved to '{out}'") + except FileNotFoundError: + print(f"Cannot save the results to '{out}'") + +def get_url_filename(url: str, downloads_directory: str = ""): + """ + Derive a filename from a URL.\n + If a duplicate exists, append a unique number to the filename.\n + Returns the full path to the file. + """ + tmp = urllib.parse.urlsplit(url) + base = tmp.path.strip("/").rsplit("/", 1)[-1] + if not base: + base = tmp.netloc + base = os.path.join(downloads_directory, base) + count = 0 + filename = base + while os.path.isfile(filename): + count += 1 + filename = f"{base} ({count})" + return filename diff --git a/src/chad/utils/general.py b/src/chad/utils/general.py new file mode 100644 index 0000000..a1660e4 --- /dev/null +++ b/src/chad/utils/general.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 + +import colorama, datetime, json, termcolor, typing + +colorama.init(autoreset = True) + +def get_timestamp(message): + """ + Get the current timestamp. + """ + return f"{datetime.datetime.now().strftime('%H:%M:%S')} - {message}" + +def print_error(message: str): + """ + Print an error message. + """ + print(f"ERROR: {message}") + +def print_cyan(message: str): + """ + Print a message in cyan color. + """ + termcolor.cprint(message, "cyan") + +def print_green(message: str): + """ + Print a message in green color. + """ + termcolor.cprint(message, "green") + +def print_yellow(message: str): + """ + Print a message in yellow color. + """ + termcolor.cprint(message, "yellow") + +def print_red(message: str): + """ + Print a message in red color. + """ + termcolor.cprint(message, "red") + +def jdump(data: typing.Any): + """ + Serialize a data to a JSON string. 
+ """ + return json.dumps(data, indent = 4, ensure_ascii = False) diff --git a/src/chad/utils/grep.py b/src/chad/utils/grep.py new file mode 100644 index 0000000..cfc57f8 --- /dev/null +++ b/src/chad/utils/grep.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +import regex as re + +__FLAGS = re.MULTILINE | re.IGNORECASE + +def has_site(text: str): + """ + Check if there are any matches in a text using the '(? list[str]: + """ + Remove all blacklisted values from a list using the specified RegEx pattern. + """ + tmp = [] + for entry in array: + if not re.search(blacklist, entry, flags = __FLAGS): + tmp.append(entry) + return tmp diff --git a/src/chad/utils/proxy.py b/src/chad/utils/proxy.py new file mode 100644 index 0000000..4897cd2 --- /dev/null +++ b/src/chad/utils/proxy.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 + +class Proxies: + + def __init__(self, proxies: list[str]): + """ + Class for rotating proxies in a round robin fashion. + """ + self.__proxies = proxies + self.__proxies_len = len(self.__proxies) + self.__index = 0 + self.__round_robin = False + + def is_empty(self): + """ + Returns 'True' if there are no proxies. + """ + return not self.__proxies + + def is_round_robin(self): + """ + This should be checked on each iteration.\n + If a full round has been completed, return 'True' and reset the round robin flag; otherwise, return 'False'. + """ + current = self.__round_robin + if current: + self.__round_robin = False + return current + + def get(self): + """ + Get a proxy in a round robin fashion.\n + If a full round has been completed, set the round robin flag.\n + Returns an empty string if there are no proxies. + """ + proxy = "" + if self.__proxies: + proxy = self.__proxies[self.__index] + self.__index = (self.__index + 1) % self.__proxies_len + if self.__index == 0: + self.__round_robin = True + return proxy + + def remove(self, proxy: str): + """ + Remove a proxy.\n + Returns an empty message if the proxy does not exist. + """ + message = "" + if proxy in self.__proxies: + self.__proxies.pop(self.__proxies.index(proxy)) + self.__proxies_len -= 1 + message = f"Removing '{proxy}' due to an error or rate limiting | Proxies left: {self.__proxies_len}" + self.__index = max(0, self.__index - 1) + return message diff --git a/src/chad/utils/validate.py b/src/chad/utils/validate.py new file mode 100644 index 0000000..d7d2890 --- /dev/null +++ b/src/chad/utils/validate.py @@ -0,0 +1,277 @@ +#!/usr/bin/env python3 + +from . 
import config, directory, file, general + +import argparse, nagooglesearch, sys + +class MyArgParser(argparse.ArgumentParser): + + def print_help(self): + print(f"Chad {config.APP_VERSION} ( github.com/ivan-sincek/chad )") + print("") + print("Usage: chad -q queries [-s site ] [-x proxies ] [-o out ]") + print("Example: chad -q queries.txt [-s *.example.com] [-x proxies.txt] [-o results.json]") + print("") + print("DESCRIPTION") + print(" Search Google Dorks like Chad") + print("QUERIES") + print(" File containing Google Dorks or a single query to use") + print(" -q, --queries = queries.txt | intext:password | \"ext:tar OR ext:zip\" | etc.") + print("SITE") + print(" Domain[s] to search") + print(" -s, --site = example.com | sub.example.com | *.example.com | \"*.example.com -www\" | etc.") + print("TIME") + print(" Get results not older than the specified time in months") + print(" -t, --time = 6 | 12 | 24 | etc.") + print("TOTAL RESULTS") + print(" Total number of unique results") + print(" Default: 100") + print(" -tr, --total-results = 200 | etc.") + print("PAGE RESULTS") + print(" Number of results per page - capped at 100 by Google") + print(" Default: randint(70, 100)") + print(" -pr, --page-results = 50 | etc.") + print("MINIMUM QUERIES") + print(" Minimum sleep time in seconds between Google queries") + print(" Default: 75") + print(" -min-q, --minimum-queries = 120 | etc.") + print("MAXIMUM QUERIES") + print(" Maximum sleep time between Google queries") + print(" Default: minimum + 50") + print(" -max-q, --maximum-queries = 180 | etc.") + print("MINIMUM PAGES") + print(" Minimum sleep time between Google pages") + print(" Default: 15") + print(" -min-p, --minimum-pages = 30 | etc.") + print("MAXIMUM PAGES") + print(" Maximum sleep time between Google pages") + print(" Default: minimum + 10") + print(" -max-p, --maximum-pages = 60 | etc.") + print("USER AGENTS") + print(" User agents to use") + print(" Default: random-all") + print(" -a, --user-agents = user_agents.txt | random(-all) | curl/3.30.1 | etc.") + print("PROXIES") + print(" File containing web proxies or a single web proxy to use") + print(" -x, --proxies = proxies.txt | http://127.0.0.1:8080 | etc.") + print("DIRECTORY") + print(" Downloads directory") + print(" All downloaded files will be saved in this directory") + print(" -dir, --directory = downloads | etc.") + print("THREADS") + print(" Number of files to download in parallel") + print(" Default: 5") + print(" -th, --threads = 20 | etc.") + print("OUT") + print(" Output file") + print(" -o, --out = results.json | etc.") + print("NO SLEEP ON START") + print(" Disable the safety feature to prevent triggering rate limits by accident") + print(" -nsos, --no-sleep-on-start") + print("DEBUG") + print(" Enable debug output") + print(" -dbg, --debug") + + def error(self, message): + if len(sys.argv) > 1: + print("Missing a mandatory option (-q) and/or optional (-s, -t, -tr, -pr, -min-q, -max-q, -min-p, -max-p, -a, -x, -dir, -th, -o, -nsos, -dbg)") + print("Use -h or --help for more info") + else: + self.print_help() + exit() + +class Validate: + + def __init__(self): + """ + Class for validating and managing CLI arguments. 
+ """ + self.__parser = MyArgParser() + self.__parser.add_argument("-q" , "--queries" , required = True , type = str , default = "" ) + self.__parser.add_argument("-s" , "--site" , required = False, type = str , default = "" ) + self.__parser.add_argument("-t" , "--time" , required = False, type = str , default = "" ) + self.__parser.add_argument("-tr" , "--total-results" , required = False, type = str , default = "" ) + self.__parser.add_argument("-pr" , "--page-results" , required = False, type = str , default = "" ) + self.__parser.add_argument("-min-q", "--minimum-queries" , required = False, type = str , default = "" ) + self.__parser.add_argument("-max-q", "--maximum-queries" , required = False, type = str , default = "" ) + self.__parser.add_argument("-min-p", "--minimum-pages" , required = False, type = str , default = "" ) + self.__parser.add_argument("-max-p", "--maximum-pages" , required = False, type = str , default = "" ) + self.__parser.add_argument("-a" , "--user-agents" , required = False, type = str , default = "" ) + self.__parser.add_argument("-x" , "--proxies" , required = False, type = str , default = "" ) + self.__parser.add_argument("-dir" , "--directory" , required = False, type = str , default = "" ) + self.__parser.add_argument("-th" , "--threads" , required = False, type = str , default = "" ) + self.__parser.add_argument("-o" , "--out" , required = False, type = str , default = "" ) + self.__parser.add_argument("-nsos" , "--no-sleep-on-start", required = False, action = "store_true", default = False) + self.__parser.add_argument("-dbg" , "--debug" , required = False, action = "store_true", default = False) + + def validate_args(self): + """ + Validate and return the CLI arguments. + """ + self.__success = True + self.__args = self.__parser.parse_args() + self.__validate_queries() + self.__validate_time() + self.__validate_total_results() + self.__validate_page_results() + self.__validate_minimum_queries() + self.__validate_maximum_queries() + self.__validate_minimum_pages() + self.__validate_maximum_pages() + self.__validate_user_agents() + self.__validate_proxies() + self.__validate_directory() + self.__validate_threads() + return self.__success, self.__args + + def __error(self, message: str): + """ + Set the success flag to 'False' to prevent the main task from executing, and print an error message. 
+ """ + self.__success = False + general.print_error(message) + + # ------------------------------------ + + def __validate_queries(self): + tmp = [] + if file.is_file(self.__args.queries): + success, message = file.validate(self.__args.queries) + if not success: + self.__error(message) + else: + tmp = file.read_array(self.__args.queries) + if not tmp: + self.__error(f"No Google Dorks were found in \"{self.__args.queries}\"") + else: + tmp = [self.__args.queries] + self.__args.queries = tmp + + def __validate_time(self): + tmp = 0 + if self.__args.time: + if not self.__args.time.isdigit(): + self.__error("Number of months must be numeric") + else: + tmp = int(self.__args.time) + if tmp <= 0: + self.__error("Number of months must be greater than zero") + self.__args.time = tmp + + def __validate_total_results(self): + tmp = 100 + if self.__args.total_results: + if not self.__args.total_results.isdigit(): + self.__error("Total number of unique results must be numeric") + else: + tmp = int(self.__args.total_results) + if tmp <= 0: + self.__error("Total number of unique results must be greater than zero") + self.__args.total_results = tmp + + def __validate_page_results(self): + tmp = 0 + if self.__args.page_results: + if not self.__args.page_results.isdigit(): + self.__error("Number of results per page must be numeric") + else: + tmp = int(self.__args.page_results) + if tmp < 1 or tmp > 100: + self.__error("Number of results per page must be between 1 and 100") + self.__args.page_results = tmp + + def __validate_minimum_queries(self): + tmp = 75 + if self.__args.minimum_queries: + if not self.__args.minimum_queries.isdigit(): + self.__error("Minimum sleep time between Google queries must be numeric") + else: + tmp = int(self.__args.minimum_queries) + if tmp <= 0: + self.__error("Minimum sleep time between Google queries must be greater than zero") + self.__args.minimum_queries = tmp + + def __validate_maximum_queries(self): + tmp = self.__args.minimum_queries + 50 + if self.__args.maximum_queries: + if not self.__args.maximum_queries.isdigit(): + self.__error("Maximum sleep time between Google queries must be numeric") + else: + tmp = int(self.__args.maximum_queries) + if tmp <= 0: + self.__error("Maximum sleep time between Google queries must be greater than zero") + self.__args.maximum_queries = tmp + + def __validate_minimum_pages(self): + tmp = 15 + if self.__args.minimum_pages: + if not self.__args.minimum_pages.isdigit(): + self.__error("Minimum sleep time between Google pages must be numeric") + else: + tmp = int(self.__args.minimum_pages) + if tmp <= 0: + self.__error("Minimum sleep time between Google pages must be greater than zero") + self.__args.minimum_pages = tmp + + def __validate_maximum_pages(self): + tmp = self.__args.minimum_pages + 10 + if self.__args.maximum_pages: + if not self.__args.maximum_pages.isdigit(): + self.__error("Maximum sleep time between Google pages must be numeric") + else: + tmp = int(self.__args.maximum_pages) + if tmp <= 0: + self.__error("Maximum sleep time between Google pages must be greater than zero") + self.__args.maximum_pages = tmp + + def __validate_user_agents(self): + tmp = nagooglesearch.get_all_user_agents() + if self.__args.user_agents: + if file.is_file(self.__args.user_agents): + success, message = file.validate(self.__args.user_agents) + if not success: + self.__error(message) + else: + tmp = file.read_array(self.__args.user_agents) + if not tmp: + self.__error(f"No user agents were found in \"{self.__args.user_agents}\"") + else: + 
lower = self.__args.user_agents.lower() + if lower == "random-all": + pass + elif lower == "random": + tmp = [nagooglesearch.get_random_user_agent()] + else: + tmp = [self.__args.user_agents] + self.__args.user_agents = tmp + + def __validate_proxies(self): + tmp = [] + if self.__args.proxies: + if file.is_file(self.__args.proxies): + success, message = file.validate(self.__args.proxies) + if not success: + self.__error(message) + else: + tmp = file.read_array(self.__args.proxies) + if not tmp: + self.__error(f"No web proxies were found in \"{self.__args.proxies}\"") + else: + tmp = [self.__args.proxies] + self.__args.proxies = tmp + + def __validate_directory(self): + if self.__args.directory: + if not directory.is_directory(self.__args.directory): + self.__error(f"\"{self.__args.directory}\" does not exist or is not a directory") + + def __validate_threads(self): + tmp = 5 + if self.__args.threads: + if not self.__args.threads.isdigit(): + self.__error("Number of files to download in parallel must be numeric") + else: + tmp = int(self.__args.threads) + if tmp <= 0: + self.__error("Number of files to download in parallel must be greater than zero") + self.__args.threads = tmp diff --git a/src/chad_extractor/main.py b/src/chad_extractor/main.py new file mode 100644 index 0000000..23523b0 --- /dev/null +++ b/src/chad_extractor/main.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 + +from .utils import config, extractor, report, result, storage, validate + +import datetime + +# ---------------------------------------- + +class Stopwatch: + + def __init__(self): + self.__format = "%Y-%m-%d %H:%M:%S" + self.__start = datetime.datetime.now() + + def stop(self): + self.__end = datetime.datetime.now() + print(f"Script has finished in {self.__end - self.__start}") + + def get_start(self): + return self.__start.strftime(self.__format) + + def get_end(self): + return self.__end.strftime(self.__format) + +stopwatch = Stopwatch() + +# ---------------------------------------- + +def main(): + success, args = validate.Validate().validate_args() + if success: + config.banner() + results = None + storage.MyManager.register("Shared", storage.Shared) + with storage.MyManager() as manager: + shared_storage: storage.Shared = manager.Shared( + args.template, + args.results, + args.plaintext, + args.excludes, + args.debug + ) + tool = extractor.ChadExtractor( + shared_storage, + args.playwright, + args.playwright_wait, + args.concurrent_requests, + args.concurrent_requests_domain, + args.sleep, + args.random_sleep, + args.auto_throttle, + args.retries, + args.request_timeout, + args.user_agents, + args.proxy, + args.debug + ) + if not shared_storage.parse_template(): + print("No extraction details were found in the template") + elif not shared_storage.parse_input(): + print("No data was extracted" if args.plaintext else "No Chad results are suitable for extraction") + elif not args.plaintext and not tool.run(): + print("No data was extracted") + else: + shared_storage.start_validation() + if not shared_storage.parse_template(): + print("No validation details were found in the template") + elif not shared_storage.parse_input(): + print("No extracted data is suitable for validation") + elif not tool.run(): + print("No extracted data matched the validation criteria") + results = shared_storage.get_results() + stopwatch.stop() + if results.results[result.Stage.EXTRACTION].success: + report.save( + results, + stopwatch.get_start(), + stopwatch.get_end(), + args.out, + args.verbose, + args.plaintext + ) + +if __name__ == 
"__main__": + main() diff --git a/src/chad_extractor/utils/array.py b/src/chad_extractor/utils/array.py new file mode 100644 index 0000000..80b1b4f --- /dev/null +++ b/src/chad_extractor/utils/array.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 + +def unique(array: list[str], sort = False): + """ + Unique sort a list in ascending order. + """ + seen = set() + array = [x for x in array if not (x in seen or seen.add(x))] + if sort and array: + array = sorted(array, key = str.casefold) + return array diff --git a/src/chad_extractor/utils/config.py b/src/chad_extractor/utils/config.py new file mode 100644 index 0000000..b90ae23 --- /dev/null +++ b/src/chad_extractor/utils/config.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +APP_VERSION = "v7.0" + +REPORT_EXTENSION = ".report.json" + +def banner(): + """ + Display the banner. + """ + print("#########################################################################") + print("# #") + print("# Chad Extractor v7.0 #") + print("# #") + print("# Extract and validate data from Chad results or plaintext files. #") + print("# GitHub repository at github.com/ivan-sincek/chad. #") + print("# #") + print("#########################################################################") diff --git a/src/chad_extractor/utils/directory.py b/src/chad_extractor/utils/directory.py new file mode 100644 index 0000000..f259094 --- /dev/null +++ b/src/chad_extractor/utils/directory.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 + +from . import array, file + +import os + +def exists(path: str): + """ + Returns 'True' if a path exists. + """ + return os.path.exists(path) + +def is_directory(directory: str): + """ + Returns 'True' if the 'directory' exists and is a regular directory. + """ + return os.path.isdir(directory) + +def validate(directory: str): + """ + Validate a directory.\n + Success flag is 'True' if the directory has a read permission and is not empty. + """ + success = False + message = "" + if not os.access(directory, os.R_OK): + message = f"\"{directory}\" does not have a read permission" + elif not os.stat(directory).st_size > 0: + message = f"\"{directory}\" is empty" + else: + success = True + return success, message + +def list_files(directory: str, sort = False): + """ + Get all valid files from a directory. Non-recursive. + """ + tmp = [] + for filename in os.listdir(directory): + path = os.path.join(directory, filename) + if file.validate_silent(path): + tmp.append(path) + return array.unique(tmp, sort) diff --git a/src/chad_extractor/utils/extractor.py b/src/chad_extractor/utils/extractor.py new file mode 100644 index 0000000..e64544e --- /dev/null +++ b/src/chad_extractor/utils/extractor.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 + +from . import general, input, result, storage, url + +from playwright.async_api import TimeoutError as PlaywrightTimeoutError +from playwright._impl._errors import Error as PlaywrightError + +import asyncio, multiprocessing, random, scrapy, scrapy.crawler, scrapy.utils.project, typing + +# ---------------------------------------- + +class ChadExtractorSpider(scrapy.Spider): + + def __init__( + self, + shared_storage : storage.Shared, + playwright : bool, + playwright_wait: float, + request_timeout: float, + user_agents : list[str], + proxy : str, + debug : bool + ): + """ + Class for managing Scrapy's spider. 
+ """ + self.name = "ChadExtractorSpider" + self.handle_httpstatus_list = [401, 403, 404] + self.__shared_storage = shared_storage + self.__validation_started = self.__shared_storage.is_validation_started() + self.__playwright = playwright + self.__playwright_wait = playwright_wait + self.__request_timeout = request_timeout + self.__user_agents = user_agents + self.__user_agents_len = len(self.__user_agents) + self.__proxy = proxy + self.__debug = debug + self.__context = 0 + + def start_requests(self): + """ + Main method. + """ + input = self.__shared_storage.get_input() + print(general.get_timestamp(f"Number of URLs to {'validate' if self.__validation_started else 'extract'}: {len(input)}")) + print("Press CTRL + C to exit early - results will be saved, be patient") + random.shuffle(input) + for entry in input: + yield scrapy.Request( + url = entry.url, + headers = self.__get_default_headers() | self.__shared_storage.get_headers(entry.key, with_cookies = False), + cookies = self.__shared_storage.get_cookies(entry.key), + meta = self.__get_metadata(entry), + errback = self.__error, + callback = self.__success, + dont_filter = False + ) + + def __get_default_headers(self) -> dict[str, str]: + """ + Get default HTTP request headers. + """ + default_headers = { + "User-Agent" : self.__get_user_agent(), + "Accept-Language" : "en-US, *", + "Accept" : "*/*", + "Referer" : "https://www.google.com/", + "Upgrade-Insecure-Requests": "1" + } + headers = {} + for name, value in default_headers.items(): + if value: + headers[name.lower()] = value + return headers + + def __get_user_agent(self): + """ + Get a [random] user agent.\n + Returns an empty string if there are no user agents. + """ + user_agent = "" + if self.__user_agents_len > 0: + user_agent = self.__user_agents[random.randint(0, self.__user_agents_len - 1)] + return user_agent + + def __get_metadata(self, entry: input.InputGrouped) -> dict[str, typing.Any]: + """ + Get Scrapy's request metadata. + """ + # -------------------------------- + if self.__validation_started: + self.__playwright, self.__playwright_wait = self.__shared_storage.get_playwright(entry.key) + # -------------------------------- + self.__context += 1 + tmp = {} + tmp["entry" ] = entry # custom attribute + tmp["playwright_wait" ] = self.__playwright_wait # custom attribute + tmp["playwright" ] = self.__playwright + tmp["playwright_context" ] = str(self.__context) + tmp["playwright_include_page" ] = self.__playwright + tmp["playwright_context_kwargs" ] = {} + tmp["playwright_context_kwargs" ]["ignore_https_errors"] = True + tmp["playwright_context_kwargs" ]["java_script_enabled"] = True + tmp["playwright_context_kwargs" ]["accept_downloads" ] = False + tmp["playwright_context_kwargs" ]["bypass_csp" ] = False + tmp["playwright_page_goto_kwargs"] = {"wait_until": "load"} + tmp["proxy" ] = self.__proxy + tmp["cookiejar" ] = self.__context + tmp["dont_merge_cookies" ] = False + return tmp + + # ------------------------------------ + + async def __error(self, failure: typing.Any): + """ + HTTP request error callback. 
+ """ + entry = failure.request.meta["entry" ] + playwright = failure.request.meta["playwright"] + status = failure.value.response.status if failure.check(scrapy.spidermiddlewares.httperror.HttpError) else 0 + error = str(failure.value).splitlines()[0] + if playwright: + page = failure.request.meta["playwright_page"] + if any(err in error for err in ["net::ERR_ABORTED", "net::ERR_CONNECTION_RESET"]): + self.__print_fallback(playwright, status, entry.url) + content, status, error = await self.__playwright_fallback(page, entry) + await page.close() + await page.context.close() + if error: + self.__append_error(entry, playwright, status, error) + else: + self.__append_success(entry, playwright, status, content) + + async def __playwright_fallback(self, page: typing.Any, entry: input.InputGrouped) -> tuple[str, int, str]: + """ + Fallback from 'Page.goto()' to 'APIRequestContext.get()'. + """ + content = "" + status = 0 + error = "" + response = None + try: + response = await page.request.get( + url = entry.url, + headers = self.__get_default_headers() | self.__shared_storage.get_headers(entry.key, with_cookies = True), + ignore_https_errors = True, + timeout = self.__request_timeout * 1000, + max_retries = 0, + max_redirects = 10 + ) + status = response.status + content, error = general.decode(await response.body()) + except (PlaywrightError, PlaywrightTimeoutError) as ex: + error = str(ex).splitlines()[0] + finally: + if response: + await response.dispose() + return content, status, error + + def __append_error(self, entry: input.InputGrouped, playwright: bool, status: int, error: str): + """ + Append to the error list and print an error message. + """ + res = result.Result(entry.url, entry.files) + self.__shared_storage.append_error(res) + self.__print_error(playwright, status, entry.url, error) + + # ------------------------------------ + + async def __success(self, response: typing.Any): + """ + HTTP request success callback. + """ + entry = response.request.meta["entry" ] + playwright = response.request.meta["playwright"] + content = "" + error = "" + if playwright: + page = response.request.meta["playwright_page"] + wait = response.request.meta["playwright_wait"] + if wait > 0: + await asyncio.sleep(wait) + content = await page.content() + await page.close() + await page.context.close() + elif hasattr(response, "text"): + content = response.text + else: + content, error = general.decode(response.body) + if url.normalize(entry.url) != response.url: + self.__print_redirected(playwright, response.status, entry.url, response.url) + if error: + self.__append_error(entry, playwright, response.status, error) + else: + self.__append_success(entry, playwright, response.status, content) + + def __append_success(self, entry: input.InputGrouped, playwright: bool, status: int, content: str): + """ + Append to the success list and print an error message. + """ + res = result.Result(entry.url, entry.files) + res.results = self.__shared_storage.parse_response(content, entry.key) + if res.results: + self.__shared_storage.append_success(res) + self.__print_success(playwright, status, entry.url) + else: + self.__print_success_no_results(playwright, status, entry.url) + + # ------------------------------------ + + def __print_fallback(self, playwright: bool, status: int, url: str): + """ + Print fallback. 
+ """ + if self.__debug: + if status: + url = f"{status} {url}" + general.print_cyan(f"[ FALLBACK ] PW:{int(playwright)} | {url} -> Page.goto() to APIRequestContext.get()") + + def __print_error(self, playwright: bool, status: int, url: str, message: str): + """ + Print error. + """ + if self.__debug: + if status: + url = f"{status} {url}" + general.print_red(f"[ ERROR ] PW:{int(playwright)} | {url} -> {message}") + + def __print_redirected(self, playwright: bool, status: int, request_url: str, response_url: str): + """ + Print redirected. + """ + if self.__debug: + general.print_yellow(f"[ REDIRECTED ] PW:{int(playwright)} | {request_url} -> {status} {response_url}") + + def __print_success(self, playwright: bool, status: int, url: str): + """ + Print success. + """ + if self.__debug: + general.print_green(f"[ {'VALIDATED'if self.__validation_started else 'EXTRACTED'} ] PW:{int(playwright)} | {status} {url}") + + def __print_success_no_results(self, playwright: bool, status: int, url: str): + """ + Print success with no results. + """ + if self.__debug: + general.print_magenta(f"[ NO MATCH ] PW:{int(playwright)} | {status} {url}") + +# ---------------------------------------- + +class ChadExtractor: + + def __init__( + self, + shared_storage : storage.Shared, + playwright : bool, + playwright_wait : float, + concurrent_requests : int, + concurrent_requests_domain: int, + sleep : float, + random_sleep : bool, + auto_throttle : float, + retries : int, + request_timeout : float, + user_agents : list[str], + proxy : str, + debug : bool + ): + """ + Class for managing Scrapy's runner. + """ + self.__shared_storage = shared_storage + self.__playwright = playwright + self.__playwright_wait = playwright_wait + self.__concurrent_requests = concurrent_requests + self.__concurrent_requests_domain = concurrent_requests_domain + self.__sleep = sleep + self.__random_sleep = random_sleep + self.__auto_throttle = auto_throttle + self.__retries = retries + self.__request_timeout = request_timeout # all timeouts + self.__user_agents = user_agents + self.__proxy = proxy + self.__debug = debug + self.__headless_browser = True + self.__browser_type = "chromium" # Playwright's headless browser + self.__handle_sigint = False + + def __page_block(self, request: typing.Any): + """ + Types of content to block while using Playwright's headless browser. + """ + return request.resource_type in ["fetch", "stylesheet", "image", "ping", "font", "media", "imageset", "beacon", "csp_report", "object", "texttrack", "manifest"] + + def __run(self): + """ + Configure the settings and run the Chad Extractor spider. 
+ """ + settings = scrapy.utils.project.get_project_settings() + # -------------------------------- + settings["COOKIES_ENABLED" ] = True + settings["DOWNLOAD_TIMEOUT" ] = self.__request_timeout # connect / read timeout + settings["DOWNLOAD_DELAY" ] = self.__sleep + settings["RANDOMIZE_DOWNLOAD_DELAY"] = self.__random_sleep + settings["HTTPPROXY_ENABLED" ] = bool(self.__proxy) + # -------------------------------- + settings["EXTENSIONS"]["scrapy.extensions.throttle.AutoThrottle"] = 100 + # -------------------------------- + settings["AUTOTHROTTLE_ENABLED" ] = self.__auto_throttle > 0 + settings["AUTOTHROTTLE_DEBUG" ] = False + settings["AUTOTHROTTLE_START_DELAY" ] = self.__sleep + settings["AUTOTHROTTLE_MAX_DELAY" ] = settings["AUTOTHROTTLE_START_DELAY"] + 30 + settings["AUTOTHROTTLE_TARGET_CONCURRENCY"] = self.__auto_throttle + # -------------------------------- + settings["CONCURRENT_REQUESTS" ] = self.__concurrent_requests + settings["CONCURRENT_REQUESTS_PER_DOMAIN"] = self.__concurrent_requests_domain + settings["RETRY_ENABLED" ] = self.__retries > 0 + settings["RETRY_TIMES" ] = self.__retries + settings["REDIRECT_ENABLED" ] = True + settings["REDIRECT_MAX_TIMES" ] = 10 + # -------------------------------- + settings["ROBOTSTXT_OBEY" ] = False + settings["TELNETCONSOLE_ENABLED" ] = False + settings["LOG_ENABLED" ] = False + settings["REQUEST_FINGERPRINTER_IMPLEMENTATION"] = "2.7" + # -------------------------------- + if self.__shared_storage.is_validation_started(): + self.__playwright, self.__playwright_wait = self.__shared_storage.require_playwright() + # -------------------------------- + if self.__playwright: + settings["DOWNLOAD_HANDLERS"]["https"] = "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler" + settings["DOWNLOAD_HANDLERS"]["http" ] = "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler" + settings["TWISTED_REACTOR" ] = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" + settings["PLAYWRIGHT_LAUNCH_OPTIONS" ] = { + "headless" : self.__headless_browser, + "handle_sigint": self.__handle_sigint, + "proxy" : {"server": self.__proxy} if self.__proxy else None + } + settings["PLAYWRIGHT_BROWSER_TYPE" ] = self.__browser_type + settings["PLAYWRIGHT_ABORT_REQUEST" ] = self.__page_block + settings["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"] = self.__request_timeout * 1000 + # -------------------------------- + crawler = scrapy.crawler.CrawlerProcess(settings) + crawler.crawl(ChadExtractorSpider, self.__shared_storage, self.__playwright, self.__playwright_wait, self.__request_timeout, self.__user_agents, self.__proxy, self.__debug); crawler.start(); crawler.join() + + def run(self): + """ + Run Scrapy's spider. + """ + process = multiprocessing.Process(target = self.__run) + try: + process.start() + process.join() + except KeyboardInterrupt: + process.terminate() + process.join() + return self.__shared_storage.has_success() diff --git a/src/chad_extractor/utils/file.py b/src/chad_extractor/utils/file.py new file mode 100644 index 0000000..79035b2 --- /dev/null +++ b/src/chad_extractor/utils/file.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +from . import array + +import os + +__ENCODING = "ISO-8859-1" + +def is_file(file: str): + """ + Returns 'True' if the 'file' exists and is a regular file. + """ + return os.path.isfile(file) + +def validate(file: str): + """ + Validate a file.\n + Success flag is 'True' if the file has a read permission and is not empty. 
+ """ + success = False + message = "" + if not os.access(file, os.R_OK): + message = f"\"{file}\" does not have a read permission" + elif not os.stat(file).st_size > 0: + message = f"\"{file}\" is empty" + else: + success = True + return success, message + +def validate_silent(file: str): + """ + Silently validate a file.\n + Returns 'True' if the 'file' exists, is a regular file, has a read permission, and is not empty. + """ + return os.path.isfile(file) and os.access(file, os.R_OK) and os.stat(file).st_size > 0 + +def read(file: str): + """ + Read a file as text.\n + Whitespace will be stripped from the text. + """ + return open(file, "r", encoding = __ENCODING).read().strip() + +def read_array(file: str, sort = False): + """ + Read a file line by line, and append the lines to a list.\n + Whitespace will be stripped from each line, and empty lines will be removed.\n + Returns a unique [sorted] list. + """ + tmp = [] + with open(file, "r", encoding = __ENCODING) as stream: + for line in stream: + line = line.strip() + if line: + tmp.append(line) + return array.unique(tmp, sort) + +def overwrite(text: str, out: str): + """ + Write a text to an output file.\n + If the output file exists, prompt to overwrite it. + """ + confirm = "yes" + if os.path.isfile(out): + print(f"'{out}' already exists") + confirm = input("Overwrite the output file (yes): ") + if confirm.lower() in ["yes", "y"]: + try: + open(out, "w").write(text) + print(f"Results have been saved to '{out}'") + except FileNotFoundError: + print(f"Cannot save the results to '{out}'") + +def write_silent(text: str, out: str): + """ + Silently write a text to an output file. + """ + try: + open(out, "w").write(text) + except Exception: + pass diff --git a/src/chad_extractor/utils/general.py b/src/chad_extractor/utils/general.py new file mode 100644 index 0000000..c68a592 --- /dev/null +++ b/src/chad_extractor/utils/general.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +import colorama, datetime, json, termcolor, typing + +colorama.init(autoreset = True) + +__ENCODINGS = ["ISO-8859-1", "UTF-8"] + +def decode(bytes: bytes): + """ + Returns an empty string and an error message on failure. + """ + text = "" + message = "" + for encoding in __ENCODINGS: + try: + text = bytes.decode(encoding) + message = "" + break + except Exception as ex: + message = str(ex) + return text, message + +def to_float(value: str): + """ + Returns 'None' on failure. + """ + tmp = None + try: + tmp = float(value) + except ValueError: + pass + return tmp + +def jdump(data: typing.Any): + """ + Serialize a data to a JSON string. + """ + return json.dumps(data, indent = 4, ensure_ascii = False) + +# ---------------------------------------- + +def get_timestamp(message): + """ + Get the current timestamp. + """ + return f"{datetime.datetime.now().strftime('%H:%M:%S')} - {message}" + +def print_error(message: str): + """ + Print an error message. + """ + print(f"ERROR: {message}") + +def print_cyan(message: str): + """ + Print a message in cyan color. + """ + termcolor.cprint(message, "cyan") + +def print_green(message: str): + """ + Print a message in green color. + """ + termcolor.cprint(message, "green") + +def print_yellow(message: str): + """ + Print a message in yellow color. + """ + termcolor.cprint(message, "yellow") + +def print_red(message: str): + """ + Print a message in red color. + """ + termcolor.cprint(message, "red") + +def print_magenta(message: str): + """ + Print a message in magenta color. 
+ """ + termcolor.cprint(message, "magenta") diff --git a/src/chad_extractor/utils/grep.py b/src/chad_extractor/utils/grep.py new file mode 100644 index 0000000..ad55993 --- /dev/null +++ b/src/chad_extractor/utils/grep.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import regex as re + +def validate(query: str): + """ + Validate a regular expression. + """ + success = False + message = "" + try: + re.compile(query) + success = True + except re.error: + message = f"Invalid RegEx: {query}" + return success, message + +def validate_multiple(queries: list[str]): + """ + Validate multiple regular expressions. + """ + success = True + message = "" + for query in queries: + success, message = validate(query) + if not success: + break + return success, message diff --git a/src/chad_extractor/utils/input.py b/src/chad_extractor/utils/input.py new file mode 100644 index 0000000..1af517f --- /dev/null +++ b/src/chad_extractor/utils/input.py @@ -0,0 +1,49 @@ + +#!/usr/bin/env python3 + +import dataclasses, json + +@dataclasses.dataclass +class Input: + """ + Class for temporarily storing an input used for extraction or validation. + """ + url : str + key : str + file: str + +@dataclasses.dataclass +class InputGrouped: + """ + Class for storing an input used for extraction or validation grouped by URL. + """ + url : str + key : str + files: list[str] + +# ---------------------------------------- + +@dataclasses.dataclass +class ChadResults: + """ + Class for storing Chad results. + """ + query: str = "" + proxy: str = "" + urls : list[str] = dataclasses.field(default_factory = list) + +def deserialize_chad_results(chad_results_json: str) -> tuple[list[ChadResults] | None, str]: + """ + Deserialize Chad results from a JSON string.\n + Returns 'None' and an error message on failure. + """ + results = None + message = "" + try: + tmp = json.loads(chad_results_json) + for i in range(len(tmp)): + tmp[i] = ChadResults(**tmp[i]) + results = tmp + except Exception: + message = "Cannot deserialize Chad results" + return results, message diff --git a/src/chad_extractor/utils/jquery.py b/src/chad_extractor/utils/jquery.py new file mode 100644 index 0000000..8cd680f --- /dev/null +++ b/src/chad_extractor/utils/jquery.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 + +from . import array, input, result + +import collections + +def select_urls(obj: list[input.ChadResults], sort = False): + """ + Get all 'ChadResults.urls'.\n + Returns a unique [sorted] list. + """ + tmp = [] + for entry in obj: + tmp.extend(entry.urls) + return array.unique(tmp, sort) + +def group_by_url(obj: list[input.Input]) -> list[input.InputGrouped]: + """ + Group the input by 'Input.url'. + """ + grouped = collections.defaultdict(lambda: input.InputGrouped("", "", [])) + for entry in obj: + grouped[entry.url].url = entry.url + grouped[entry.url].key = entry.key + grouped[entry.url].files.append(entry.file) + tmp = [] + for entry in list(grouped.values()): + entry.files = array.unique(entry.files, sort = True) + tmp.append(entry) + return tmp + +# ---------------------------------------- + +def select_results(obj: list[result.Result | result.ResultPlaintext], sort = False): + """ + Get all 'Result.results[key]' or 'ResultPlaintext.results[key]'.\n + Returns a unique [sorted] list. + """ + tmp = [] + for entry in obj: + for key in entry.results: + tmp.extend(entry.results[key]) + return array.unique(tmp, sort) + +# ---------------------------------------- + +def sort_by_url(obj: list[result.Result]): + """ + Sort the results by 'Result.url'. 
+ """ + return sorted(obj, key = lambda entry: entry.url.casefold()) + +def select_url(obj: list[result.Result], sort = False): + """ + Get all 'Result.url'.\n + Returns a unique [sorted] list. + """ + tmp = [] + for entry in obj: + tmp.append(entry.url) + return array.unique(tmp, sort) + +# ---------------------------------------- + +def sort_by_file(obj: list[result.ResultPlaintext]): + """ + Sort the results by 'ResultPlaintext.file'. + """ + return sorted(obj, key = lambda entry: entry.file.casefold()) + +def select_file(obj: list[result.ResultPlaintext], sort = False): + """ + Get all 'ResultPlaintext.file'.\n + Returns a unique [sorted] list. + """ + tmp = [] + for entry in obj: + tmp.append(entry.file) + return array.unique(tmp, sort) + +def select_files(obj: list[result.Result], sort = False): + """ + Get all 'Result.files'.\n + Returns a unique [sorted] list. + """ + tmp = [] + for entry in obj: + tmp.extend(entry.files) + return array.unique(tmp, sort) + +# ---------------------------------------- + +def select_by_file(obj: list[result.ResultPlaintext], file: str) -> list[result.ResultPlaintext]: + """ + Get all 'Result' for the specified file. + """ + tmp = [] + for entry in obj: + if file == entry.file: + tmp.append(entry) + break + return tmp + +def select_by_files(obj: list[result.Result], file: str) -> list[result.Result]: + """ + Get all 'Result' for the specified file. + """ + tmp = [] + for entry in obj: + if file in entry.files: + tmp.append(entry) + return tmp + +def select_url_by_file(obj: list[result.Result], file: str, sort = False): + """ + Get all 'Result' for the specified file.\n + Returns a unique [sorted] list. + """ + tmp = [] + for entry in obj: + if file in entry.files: + tmp.append(entry.url) + return array.unique(tmp, sort) diff --git a/src/chad_extractor/utils/report.py b/src/chad_extractor/utils/report.py new file mode 100644 index 0000000..229f08e --- /dev/null +++ b/src/chad_extractor/utils/report.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 + +from . 
import config, file, general, jquery, result + +import dataclasses + +def save(results: result.Results, started_at: str, ended_at: str, out: str, verbose: bool, plaintext: bool): + """ + Save the primary report to the output file and, if verbose, a secondary report for each input file. + """ + # ------------------------------------ + tmp = Report(started_at, ended_at) + # ------------------------------------ + extracted = results.results[result.Stage.EXTRACTION].success + extracted_error = results.results[result.Stage.EXTRACTION].error + validated = results.results[result.Stage.VALIDATION].success + validated_error = results.results[result.Stage.VALIDATION].error + # ------------------------------------ + validated = jquery.sort_by_url(validated) + tmp.summary.validated = jquery.select_url(validated) + # ------------------------------------ + extracted = jquery.sort_by_file(extracted) if plaintext else jquery.sort_by_url(extracted) + tmp.full = extracted + tmp.summary.extracted = jquery.select_results(tmp.full, sort = True) + # ------------------------------------ + validated_error = jquery.sort_by_url(validated_error) + tmp.failed.validation = jquery.select_url(validated_error) + # ------------------------------------ + extracted_error = jquery.sort_by_url(extracted_error) + tmp.failed.extraction = jquery.select_url(extracted_error) + # ------------------------------------ + file.overwrite(get_primary(tmp, plaintext), out) + # ------------------------------------ + if verbose: + for path in jquery.select_file(extracted) if plaintext else jquery.select_files(extracted): + # ---------------------------- + tmp = Report(started_at, ended_at) + # ---------------------------- + tmp.summary.validated = jquery.select_url_by_file(validated, path) + # ---------------------------- + tmp.full = jquery.select_by_file(extracted, path) if plaintext else jquery.select_by_files(extracted, path) + tmp.summary.extracted = jquery.select_results(tmp.full, sort = True) + # ---------------------------- + tmp.failed.validation = jquery.select_url_by_file(validated_error, path) + # ---------------------------- + tmp.failed.extraction = jquery.select_url_by_file(extracted_error, path) + # ---------------------------- + file.write_silent(get_secondary(tmp, plaintext), path.rsplit(".", 1)[0] + config.REPORT_EXTENSION) + +@dataclasses.dataclass +class ReportSummary: + """ + Class for storing a summary of validated and extracted results. + """ + validated: list[str] = dataclasses.field(default_factory = list) + extracted: list[str] = dataclasses.field(default_factory = list) + +@dataclasses.dataclass +class ReportFailed: + """ + Class for storing URLs that failed validation or extraction. + """ + validation: list[str] = dataclasses.field(default_factory = list) + extraction: list[str] = dataclasses.field(default_factory = list) + +@dataclasses.dataclass +class Report: + """ + Class for storing a report. + """ + started_at: str = "" + ended_at : str = "" + summary : ReportSummary = dataclasses.field(default_factory = ReportSummary) + failed : ReportFailed = dataclasses.field(default_factory = ReportFailed) + full : list = dataclasses.field(default_factory = list) + +def get_primary(report: Report, plaintext: bool): + """ + Serialize the primary report to a JSON string. + """ + tmp = dataclasses.asdict(report) + if not plaintext: + for entry in tmp.get("full", []): + del entry["files"] + else: + del tmp["failed"]["extraction"] + return general.jdump(tmp) + +def get_secondary(report: Report, plaintext: bool): + """ + Serialize a secondary, per-file report to a JSON string. + """ + tmp = dataclasses.asdict(report) + if not plaintext: + for entry in tmp.get("full", []): + del entry["files"] + else: + del tmp["failed"]["extraction"] + tmp["results"] = tmp["full"][0]["results"] + del tmp["full"] + return general.jdump(tmp) diff --git a/src/chad_extractor/utils/result.py
b/src/chad_extractor/utils/result.py new file mode 100644 index 0000000..e9ef4a0 --- /dev/null +++ b/src/chad_extractor/utils/result.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 + +import dataclasses, enum + +class Stage(enum.Enum): + """ + Enum containing stages. + """ + EXTRACTION = "extraction" + VALIDATION = "validation" + +@dataclasses.dataclass +class Result: + """ + Class for storing a result. + """ + url : str + files : list[str] + results: dict[str, list[str]] = dataclasses.field(default_factory = dict) + +@dataclasses.dataclass +class ResultPlaintext: + """ + Class for storing a plaintext result. + """ + file : str + results: dict[str, list[str]] = dataclasses.field(default_factory = dict) + +@dataclasses.dataclass +class StageResults: + """ + Class for storing results of a single stage. + """ + success: list[Result | ResultPlaintext] = dataclasses.field(default_factory = list) + error : list[Result | ResultPlaintext] = dataclasses.field(default_factory = list) + +@dataclasses.dataclass +class Results: + """ + Class for storing results of multiple stages. + """ + results: dict[Stage, StageResults] = dataclasses.field(default_factory = dict) + + def __post_init__(self): + self.results[Stage.EXTRACTION] = StageResults() + self.results[Stage.VALIDATION] = StageResults() diff --git a/src/chad_extractor/utils/storage.py b/src/chad_extractor/utils/storage.py new file mode 100644 index 0000000..9f58186 --- /dev/null +++ b/src/chad_extractor/utils/storage.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 + +from . import array, file, general, input, jquery, result, template + +import multiprocessing.managers, regex as re + +class MyManager(multiprocessing.managers.BaseManager): + pass + +class Shared: + + def __init__( + self, + template : template.Template, + input : list[str], + plaintext : bool, + excludes : list[str], + debug : bool + ): + """ + Class for managing a shared storage in multiprocessing. + """ + self.__template = template + self.__input = input + self.__plaintext = plaintext + self.__excludes = excludes + self.__debug = debug + self.__flags = re.MULTILINE | re.IGNORECASE + self.__stage = result.Stage.EXTRACTION + self.__results = result.Results() + + def start_validation(self): + """ + Start validation. + """ + self.__stage = result.Stage.VALIDATION + + def is_validation_started(self): + """ + Check if validation has started. + """ + return self.__stage == result.Stage.VALIDATION + + def get_input(self) -> list[input.InputGrouped]: + """ + Get the input used for extraction or validation. + """ + return self.__input + + def has_input(self): + """ + Check if there is any input to be used for extraction or validation. + """ + return bool(self.get_input()) + + def append_error(self, result: result.Result | result.ResultPlaintext): + """ + Append a result to the error list. + """ + self.__results.results[self.__stage].error.append(result) + + def get_error(self): + """ + Get the error list. + """ + return self.__results.results[self.__stage].error + + def has_error(self): + """ + Check if there is any result in the error list. + """ + return bool(self.get_error()) + + def append_success(self, result: result.Result | result.ResultPlaintext): + """ + Append a result to the success list. + """ + self.__results.results[self.__stage].success.append(result) + + def get_success(self): + """ + Get the success list. + """ + return self.__results.results[self.__stage].success + + def has_success(self): + """ + Check if there is any result in the success list. 
+ """ + return bool(self.get_success()) + + def get_results(self): + """ + Get all results. + """ + return self.__results + + def require_playwright(self): + """ + Check if Playwright's headless browser is required and set the browser wait time to zero.\n + Applies only for validation.\n + Returns 'True' if required and '0'. + """ + playwright, playwright_wait = False, 0 + for value in self.__template.entries.values(): + if value.validate_browser: + playwright = True + break + return playwright, playwright_wait + + def get_playwright(self, key: str): + """ + Check if Playwright's headless browser is required and get the browser wait time for the specified key.\n + Applies only for validation.\n + Returns 'True' if required and the browser wait time. + """ + return self.__template.entries[key].validate_browser, self.__template.entries[key].validate_browser_wait + + def parse_template(self): + """ + During extraction, remove all template entries without the 'extract' RegEx.\n + During validation, remove all template entries without the 'validate' RegEx.\n + Returns 'False' if no entries are left. + """ + for key in list(self.__template.entries.keys()): + if (self.__stage == result.Stage.EXTRACTION and not self.__template.entries[key].extract) or (self.__stage == result.Stage.VALIDATION and not self.__template.entries[key].validate): + self.__template.entries.pop(key) + return bool(self.__template.entries) + + def parse_input(self): + """ + Parse the input used for extraction or validation. + """ + tmp = [] + if not self.is_validation_started(): + if not self.__plaintext: + for path in self.__input: + chad_results, message = input.deserialize_chad_results(file.read(path)) + if message: + if self.__debug: + general.print_red(f"{message} from \"{path}\"") + else: + for url in jquery.select_urls(chad_results): + tmp.append(input.Input(url, "", path)) + else: + for path in self.__input: + results = self.parse_response(content = file.read(path)) # plaintext files are treated like server responses + if results: + self.append_success(result.ResultPlaintext(path, results)) + return self.has_success() + else: + if not self.__plaintext: + for entry in self.__results.results[result.Stage.EXTRACTION].success: # extracted data + for key in entry.results: + if key in self.__template.entries: + for url in entry.results[key]: + for path in entry.files: + tmp.append(input.Input(url, key, path)) + else: + for entry in self.__results.results[result.Stage.EXTRACTION].success: # extracted data + for key in entry.results: + if key in self.__template.entries: + for url in entry.results[key]: + tmp.append(input.Input(url, key, entry.file)) + self.__input = jquery.group_by_url(tmp) + return self.has_input() + + def parse_response(self, content: str, key = "") -> dict[str, list[str]] | bool: + """ + Parse an HTTP response content as a result of extraction or validation. 
+ """ + tmp = {} + try: + if not self.is_validation_started(): + if self.__excludes: + for query in self.__excludes: + content = re.sub(query, "", content, flags = self.__flags) + for key, value in self.__template.entries.items(): + matches = re.findall(value.extract, content, flags = self.__flags) + if matches: + if value.extract_prepend or value.extract_append: + for i in range(len(matches)): + matches[i] = value.extract_prepend + matches[i] + value.extract_append + tmp[key] = array.unique(matches, sort = True) + elif re.search(self.__template.entries[key].validate, content, flags = self.__flags): + tmp = True + except (re.error, KeyError) as ex: + if self.__debug: + general.print_red(str(ex)) + return tmp + + def get_headers(self, key = "", with_cookies = False) -> dict[str, str]: + """ + Get validation HTTP request headers for the specified key.\n + Returns an empty dictionary if the specified key does not exist or is empty. + """ + headers = {} + if key and key in self.__template.entries: + for name, value in self.__template.entries[key].validate_headers.items(): + headers[name.lower()] = value + if with_cookies: + cookies = self.get_cookies(key) + if cookies: + headers["cookie"] = ("; ").join(f"{name}={value}" for name, value in cookies.items()) # for APIRequestContext.get() + return headers + + def get_cookies(self, key = "") -> dict[str, str]: + """ + Get validation HTTP cookies for the specified key.\n + Returns an empty dictionary if the specified key does not exist or is empty. + """ + cookies = {} + if key and key in self.__template.entries: + for name, value in self.__template.entries[key].validate_cookies.items(): + cookies[name.lower()] = value + return cookies diff --git a/src/chad_extractor/utils/template.py b/src/chad_extractor/utils/template.py new file mode 100644 index 0000000..9486ce1 --- /dev/null +++ b/src/chad_extractor/utils/template.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +import dataclasses, json + +@dataclasses.dataclass +class TemplateEntry: + """ + Class for storing a single entry of Chad Extractor template. + """ + extract : str + extract_prepend : str = "" + extract_append : str = "" + validate : str = "" + validate_browser : bool = False + validate_browser_wait: float = 0 + validate_headers : dict[str, str] = dataclasses.field(default_factory = dict) + validate_cookies : dict[str, str] = dataclasses.field(default_factory = dict) + +@dataclasses.dataclass +class Template: + """ + Class for storing a Chad Extractor template. + """ + entries: dict[str, TemplateEntry] = dataclasses.field(default_factory = dict) + +# ---------------------------------------- + +def deserialize(template_json: str) -> tuple[Template | None, str]: + """ + Deserialize a Chad Extractor template from a JSON string.\n + Returns 'None' and an error message on failure. + """ + template = Template() + message = "" + try: + tmp = json.loads(template_json) + for key in tmp.keys(): + template.entries[key] = TemplateEntry(**tmp[key]) + except Exception: + template = None + message = "Cannot deserialize the template" + return template, message diff --git a/src/chad_extractor/utils/url.py b/src/chad_extractor/utils/url.py new file mode 100644 index 0000000..32730bd --- /dev/null +++ b/src/chad_extractor/utils/url.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +import urllib.parse + +__URL_SCHEME_WHITELIST = ["http", "https", "socks4", "socks4h", "socks5", "socks5h"] +__MIN_PORT_NUM = 1 +__MAX_PORT_NUM = 65535 + +def validate(url: str): + """ + Validate a URL. 
+ """ + success = False + message = "" + tmp = urllib.parse.urlsplit(url) + if not tmp.scheme: + message = f"URL scheme is required: {url}" + elif tmp.scheme not in __URL_SCHEME_WHITELIST: + message = f"Supported URL schemes are 'http[s]', 'socks4[h]', and 'socks5[h]': {url}" + elif not tmp.netloc: + message = f"Invalid domain name: {url}" + elif tmp.port and (tmp.port < __MIN_PORT_NUM or tmp.port > __MAX_PORT_NUM): + message = f"Port number is out of range: {url}" + else: + success = True + return success, message + +def normalize(url: str): + """ + Normalize a URL. + """ + return urllib.parse.urlsplit(url).geturl() diff --git a/src/chad_extractor/utils/validate.py b/src/chad_extractor/utils/validate.py new file mode 100644 index 0000000..ac985f6 --- /dev/null +++ b/src/chad_extractor/utils/validate.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 + +from . import config, directory, file, general, grep, template, url + +import argparse, nagooglesearch, sys + +class MyArgParser(argparse.ArgumentParser): + + def print_help(self): + print(f"Chad Extractor {config.APP_VERSION} ( github.com/ivan-sincek/chad )") + print("") + print("Usage: chad-extractor -t template -res results -o out [-s sleep] [-rs random-sleep]") + print("Example: chad-extractor -t template.json -res chad_results -o report.json [-s 1.5 ] [-rs ]") + print("") + print("DESCRIPTION") + print(" Extract and validate data from Chad results or plaintext files") + print("TEMPLATE") + print(" File containing extraction and validation details") + print(" -t, --template = template.json | etc.") + print("RESULTS") + print(" Directory containing Chad results or plaintext files, or a single file") + print(f" If a directory is specified, files ending with '{config.REPORT_EXTENSION}' will be ignored") + print(" -res, --results = chad_results | results.json | urls.txt | etc.") + print("PLAINTEXT") + print(" Treat all the results as plaintext files / server responses") + print(" -pt, --plaintext") + print("EXCLUDES") + print(" File containing regular expressions or a single regular expression to exclude content from the page") + print(" Applies only for extraction") + print(" -e, --excludes = regexes.txt | \"
.+?<\\/div>\" | etc.") + print("PLAYWRIGHT") + print(" Use Playwright's headless browser") + print(" Applies only for extraction") + print(" -p, --playwright") + print("PLAYWRIGHT WAIT") + print(" Wait time in seconds before fetching the page content") + print(" Applies only for extraction") + print(" -pw, --playwright-wait = 0.5 | 2 | 4 | etc.") + print("CONCURRENT REQUESTS") + print(" Number of concurrent requests") + print(" Default: 15") + print(" -cr, --concurrent-requests = 30 | 45 | etc.") + print("CONCURRENT REQUESTS PER DOMAIN") + print(" Number of concurrent requests per domain") + print(" Default: 5") + print(" -crd, --concurrent-requests-domain = 10 | 15 | etc.") + print("SLEEP") + print(" Sleep time in seconds between two consecutive requests to the same domain") + print(" -s, --sleep = 1.5 | 3 | etc.") + print("RANDOM SLEEP") + print(" Randomize the sleep time between requests to vary between '0.5 * sleep' and '1.5 * sleep'") + print(" -rs, --random-sleep") + print("AUTO THROTTLE") + print(" Auto throttle concurrent requests based on the load and latency") + print(" Sleep time is still respected") + print(" -at, --auto-throttle = 0.5 | 10 | 15 | 45 | etc.") + print("RETRIES") + print(" Number of retries per URL") + print(" Default: 2") + print(" -r, --retries = 0 | 4 | etc.") + print("REQUEST TIMEOUT") + print(" Request timeout in seconds") + print(" Default: 60") + print(" -rt, --request-timeout = 30 | 90 | etc.") + print("USER AGENTS") + print(" User agents to use") + print(" Default: random-all") + print(" -a, --user-agents = user_agents.txt | random(-all) | curl/3.30.1 | etc.") + print("PROXY") + print(" Web proxy to use") + print(" -x, --proxy = http://127.0.0.1:8080 | etc.") + print("OUT") + print(" Output file") + print(" -o, --out = report.json | etc.") + print("VERBOSE") + print(f" Create additional supporting output files that end with '{config.REPORT_EXTENSION}'") + print(" -v, --verbose") + print("DEBUG") + print(" Enable debug output") + print(" -dbg, --debug") + + def error(self, message): + if len(sys.argv) > 1: + print("Missing a mandatory option (-t, -res, -o) and/or optional (-pt, -e, -p, -pw, -cr, -crd, -s, -rs, -at, -r, -rt, -a, -x, -v, -dbg)") + print("Use -h or --help for more info") + else: + self.print_help() + exit() + +class Validate: + + def __init__(self): + """ + Class for validating and managing CLI arguments. 
+ """ + self.__parser = MyArgParser() + self.__parser.add_argument("-t" , "--template" , required = True , type = str , default = "" ) + self.__parser.add_argument("-res", "--results" , required = True , type = str , default = "" ) + self.__parser.add_argument("-pt" , "--plaintext" , required = False, action = "store_true", default = False) + self.__parser.add_argument("-e" , "--excludes" , required = False, type = str , default = "" ) + self.__parser.add_argument("-p" , "--playwright" , required = False, action = "store_true", default = False) + self.__parser.add_argument("-pw" , "--playwright-wait" , required = False, type = str , default = "" ) + self.__parser.add_argument("-cr" , "--concurrent-requests" , required = False, type = str , default = "" ) + self.__parser.add_argument("-crd", "--concurrent-requests-domain", required = False, type = str , default = "" ) + self.__parser.add_argument("-s" , "--sleep" , required = False, type = str , default = "" ) + self.__parser.add_argument("-rs" , "--random-sleep" , required = False, action = "store_true", default = False) + self.__parser.add_argument("-at" , "--auto-throttle" , required = False, type = str , default = "" ) + self.__parser.add_argument("-r" , "--retries" , required = False, type = str , default = "" ) + self.__parser.add_argument("-rt" , "--request-timeout" , required = False, type = str , default = "" ) + self.__parser.add_argument("-a" , "--user-agents" , required = False, type = str , default = "" ) + self.__parser.add_argument("-x" , "--proxy" , required = False, type = str , default = "" ) + self.__parser.add_argument("-o" , "--out" , required = True , type = str , default = "" ) + self.__parser.add_argument("-v" , "--verbose" , required = False, action = "store_true", default = False) + self.__parser.add_argument("-dbg", "--debug" , required = False, action = "store_true", default = False) + + def validate_args(self): + """ + Validate and return the CLI arguments. + """ + self.__success = True + self.__args = self.__parser.parse_args() + self.__validate_template() + self.__validate_results() + self.__validate_excludes() + self.__validate_playwright_wait() + self.__validate_concurrent_requests() + self.__validate_concurrent_requests_domain() + self.__validate_sleep() + self.__validate_auto_throttle() + self.__validate_retries() + self.__validate_request_timeout() + self.__validate_user_agents() + self.__validate_proxy() + return self.__success, self.__args + + def __error(self, message: str): + """ + Set the success flag to 'False' to prevent the main task from executing, and print an error message. 
+ """ + self.__success = False + general.print_error(message) + + # ------------------------------------ + + def __validate_template(self): + tmp = None + if not file.is_file(self.__args.template): + self.__error(f"\"{self.__args.template}\" does not exist") + else: + success, message = file.validate(self.__args.template) + if not success: + self.__error(message) + else: + tmp = file.read(self.__args.template) + if not tmp: + self.__error(f"No template was found in \"{self.__args.template}\"") + else: + tmp, message = template.deserialize(tmp) + if message: + self.__error(f"{message} from \"{self.__args.template}\"") + self.__args.template = tmp + + def __validate_results(self): + tmp = [] + if not directory.exists(self.__args.results): + self.__error(f"\"{self.__args.results}\" does not exist") + elif directory.is_directory(self.__args.results): + success, message = directory.validate(self.__args.results) + if not success: + self.__error(message) + else: + for path in directory.list_files(self.__args.results): + if not path.endswith(config.REPORT_EXTENSION): + tmp.append(path) + if not tmp: + self.__error(f"No valid files were found in \"{self.__args.results}\"") + else: + success, message = file.validate(self.__args.results) + if not success: + self.__error(message) + else: + tmp = [self.__args.results] + self.__args.results = tmp + + def __validate_excludes(self): + tmp = [] + if self.__args.excludes: + if file.is_file(self.__args.excludes): + success, message = file.validate(self.__args.excludes) + if not success: + self.__error(message) + else: + tmp = file.read_array(self.__args.excludes) + if not tmp: + self.__error(f"No regular expressions were found in \"{self.__args.excludes}\"") + else: + success, message = grep.validate_multiple(tmp) + if not success: + self.__error(message) + else: + success, message = grep.validate(self.__args.excludes) + if not success: + self.__error(message) + else: + tmp = [self.__args.excludes] + self.__args.excludes = tmp + + def __validate_playwright_wait(self): + tmp = 0 + if self.__args.playwright_wait: + tmp = general.to_float(self.__args.playwright_wait) + if tmp is None: + self.__error("Playwright's wait time must be numeric") + elif tmp <= 0: + self.__error("Playwright's wait time must be greater than zero") + self.__args.playwright_wait = tmp + + def __validate_concurrent_requests(self): + tmp = 15 + if self.__args.concurrent_requests: + if not self.__args.concurrent_requests.isdigit(): + self.__error("Number of concurrent requests must be numeric") + else: + tmp = int(self.__args.concurrent_requests) + if tmp <= 0: + self.__error("Number of concurrent requests must be greater than zero") + self.__args.concurrent_requests = tmp + + def __validate_concurrent_requests_domain(self): + tmp = 5 + if self.__args.concurrent_requests_domain: + if not self.__args.concurrent_requests_domain.isdigit(): + self.__error("Number of concurrent requests per domain must be numeric") + else: + tmp = int(self.__args.concurrent_requests_domain) + if tmp <= 0: + self.__error("Number of concurrent requests per domain must be greater than zero") + self.__args.concurrent_requests_domain = tmp + + def __validate_sleep(self,): + tmp = 0 + if self.__args.sleep: + tmp = general.to_float(self.__args.sleep) + if tmp is None: + self.__error("Sleep time between two consecutive requests must be numeric") + elif tmp <= 0: + self.__error("Sleep time between two consecutive requests must be greater than zero") + self.__args.sleep = tmp + + def __validate_auto_throttle(self): + tmp 
= 0 + if self.__args.auto_throttle: + tmp = general.to_float(self.__args.auto_throttle) + if tmp is None: + self.__error("Auto throttle must be numeric") + elif tmp <= 0: + self.__error("Auto throttle must be greater than zero") + self.__args.auto_throttle = tmp + + def __validate_retries(self): + tmp = 2 + if self.__args.retries: + if not self.__args.retries.isdigit(): + self.__error("Number of retries must be numeric") + else: + tmp = int(self.__args.retries) + if tmp <= 0: + self.__error("Number of retries must be greater than zero") + self.__args.retries = tmp + + def __validate_request_timeout(self): + tmp = 60 + if self.__args.request_timeout: + tmp = general.to_float(self.__args.request_timeout) + if tmp is None: + self.__error("Request timeout must be numeric") + elif tmp <= 0: + self.__error("Request timeout must be greater than zero") + self.__args.request_timeout = tmp + + def __validate_user_agents(self): + tmp = nagooglesearch.get_all_user_agents() + if self.__args.user_agents: + if file.is_file(self.__args.user_agents): + success, message = file.validate(self.__args.user_agents) + if not success: + self.__error(message) + else: + tmp = file.read_array(self.__args.user_agents) + if not tmp: + self.__error(f"No user agents were found in \"{self.__args.user_agents}\"") + else: + lower = self.__args.user_agents.lower() + if lower == "random-all": + pass + elif lower == "random": + tmp = [nagooglesearch.get_random_user_agent()] + else: + tmp = [self.__args.user_agents] + self.__args.user_agents = tmp + + def __validate_proxy(self): + if self.__args.proxy: + success, message = url.validate(self.__args.proxy) + if not success: + self.__error(message) diff --git a/src/dorks/social_media_dorks.txt b/src/dorks/social_media_dorks.txt new file mode 100644 index 0000000..175421d --- /dev/null +++ b/src/dorks/social_media_dorks.txt @@ -0,0 +1,8 @@ +intext:"t.me/" +intext:"discord.com/invite/" OR intext:"discord.gg/invite/" +intext:"youtube.com/c/" OR intext:"youtube.com/channel/" +intext:"twitter.com/" OR intext:"x.com/" +intext:"facebook.com/" +intext:"instagram.com/" +intext:"tiktok.com/" +intext:"linkedin.com/in/" OR intext:"linkedin.com/company/" diff --git a/src/templates/social_media_template.json b/src/templates/social_media_template.json new file mode 100644 index 0000000..70e42ba --- /dev/null +++ b/src/templates/social_media_template.json @@ -0,0 +1,63 @@ +{ + "telegram":{ + "extract":"t\\.me\\/(?:(?!(?:share)(?:(?:\\/|\\?|\\\\|\"|\\<)*$|(?:\\/|\\?|\\\\|\\\"|\\<)[\\s\\S]))[\\w\\d\\.\\_\\-\\+\\@]+)(?" + }, + "discord":{ + "extract":"discord\\.(?:com|gg)\\/invite\\/[\\w\\d\\.\\_\\-\\+\\@]+(?", + "validate_cookies":{ + "SOCS":"CAESEwgDEgk2OTk3ODk2MzcaAmVuIAEaBgiAn5S6Bg" + } + }, + "twitter":{ + "extract":"(?<=(?