diff --git a/core/colors.py b/core/colors.py
new file mode 100644
index 0000000..7e33051
--- /dev/null
+++ b/core/colors.py
@@ -0,0 +1,20 @@
+import sys
+
+colors = True  # Output should be colored
+machine = sys.platform  # Detecting the OS of the current system
+if machine.lower().startswith(('os', 'win', 'darwin', 'ios')):
+    colors = False  # Colors shouldn't be displayed on macOS and Windows
+if not colors:
+    end = red = white = green = yellow = run = bad = good = info = que = ''
+else:
+    white = '\033[97m'
+    green = '\033[92m'
+    red = '\033[91m'
+    yellow = '\033[93m'
+    end = '\033[0m'
+    back = '\033[7;91m'
+    info = '\033[93m[!]\033[0m'
+    que = '\033[94m[?]\033[0m'
+    bad = '\033[91m[-]\033[0m'
+    good = '\033[92m[+]\033[0m'
+    run = '\033[97m[~]\033[0m'
diff --git a/core/config.py b/core/config.py
index 3103f7c..fd7a357 100644
--- a/core/config.py
+++ b/core/config.py
@@ -1,4 +1,7 @@
 """Configuration options for Photon."""
+
+verbose = False
+
 intels = [
     'facebook.com',
     'github.com',
diff --git a/core/flash.py b/core/flash.py
new file mode 100644
index 0000000..cbaa22c
--- /dev/null
+++ b/core/flash.py
@@ -0,0 +1,53 @@
+from __future__ import print_function
+
+import sys
+from core.colors import info
+
+try:
+    import concurrent.futures
+except ImportError:
+    import threading
+
+def threader(function, *urls):
+    """Start multiple threads for a function."""
+    threads = []
+    # Because URLs is a tuple
+    urls = urls[0]
+    # Iterating over URLs
+    for url in urls:
+        task = threading.Thread(target=function, args=(url,))
+        threads.append(task)
+    # Start threads
+    for thread in threads:
+        thread.start()
+    # Wait for all threads to complete their work
+    for thread in threads:
+        thread.join()
+    # Delete threads
+    del threads[:]
+
+
+def flash(function, links, thread_count):
+    """Process the URLs and use a threadpool to execute a function."""
+    # Convert links (set) to list
+    links = list(links)
+    if sys.version_info < (3, 2):
+        for begin in range(0, len(links), thread_count):  # Range with step
+            end = begin + thread_count
+            splitted = links[begin:end]
+            threader(function, splitted)
+            progress = end
+            if progress > len(links):  # Fix if overflow
+                progress = len(links)
+            print('\r%s Progress: %i/%i' % (info, progress, len(links)),
+                  end='\r')
+            sys.stdout.flush()
+    else:
+        threadpool = concurrent.futures.ThreadPoolExecutor(
+            max_workers=thread_count)
+        futures = (threadpool.submit(function, link) for link in links)
+        for i, _ in enumerate(concurrent.futures.as_completed(futures)):
+            if i + 1 == len(links) or (i + 1) % thread_count == 0:
+                print('%s Progress: %i/%i' % (info, i + 1, len(links)),
+                      end='\r')
+    print('')
\ No newline at end of file
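
For reference, core/flash.py is the thread dispatcher photon.py now calls once per crawl level. A minimal sketch of driving it on its own, assuming the package is importable (the worker function and URL set below are made up for illustration):

    from core.flash import flash

    def crawl(url):  # hypothetical per-URL worker
        print('fetching %s' % url)

    seed_links = {'http://example.com/a', 'http://example.com/b'}
    flash(crawl, seed_links, 4)  # flash prints its own progress counter
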
diff --git a/core/mirror.py b/core/mirror.py
new file mode 100644
index 0000000..f5770ac
--- /dev/null
+++ b/core/mirror.py
@@ -0,0 +1,39 @@
+import os
+import re
+
+def mirror(url, response):
+    if response != 'dummy':
+        cleanUrl = url.replace('http://', '').replace('https://', '').rstrip('/')
+        parts = cleanUrl.split('?')[0].split('/')
+        root = parts[0]
+        webpage = parts[-1]
+        parts.remove(root)
+        try:
+            parts.remove(webpage)
+        except ValueError:
+            pass
+        prefix = root + '_mirror'
+        try:
+            os.mkdir(prefix)
+        except OSError:
+            pass
+        suffix = ''
+        if parts:
+            for directory in parts:
+                suffix += directory + '/'
+                try:
+                    os.mkdir(prefix + '/' + suffix)
+                except OSError:
+                    pass
+        path = prefix + '/' + suffix
+        trail = ''
+        if '.' not in webpage:
+            trail += '.html'
+        if webpage == root:
+            name = 'index.html'
+        else:
+            name = webpage
+        if len(url.split('?')) > 1:
+            trail += '?' + url.split('?')[1]
+        with open(path + name + trail, 'w+') as out_file:
+            out_file.write(response.encode('utf-8'))
diff --git a/core/requester.py b/core/requester.py
new file mode 100644
index 0000000..5d5c559
--- /dev/null
+++ b/core/requester.py
@@ -0,0 +1,82 @@
+import time
+import random
+import requests
+from requests import get, post
+from requests.exceptions import TooManyRedirects
+
+session = requests.Session()
+session.max_redirects = 3
+
+def requester(url, main_url=None, delay=0, cook={}, headers={}, timeout=10, host=None, ninja=False, user_agents=['Photon'], failed=set(), processed=set()):
+    """Handle the requests and return the response body."""
+    # Mark the URL as crawled
+    processed.add(url)
+    # Pause/sleep the program for specified time
+    time.sleep(delay)
+
+    def normal(url):
+        """Default request"""
+        finalHeaders = headers or {
+            'Host': host,
+            # Selecting a random user-agent
+            'User-Agent': random.choice(user_agents),
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip',
+            'DNT': '1',
+            'Connection': 'close',
+        }
+        try:
+            response = session.get(url, cookies=cook, headers=finalHeaders, verify=False,
+                                   timeout=timeout, stream=True)
+        except TooManyRedirects:
+            return 'dummy'
+        if 'text/html' in response.headers['content-type']:
+            if response.status_code != 404:
+                return response.text
+            else:
+                response.close()
+                failed.add(url)
+                return 'dummy'
+        else:
+            response.close()
+            return 'dummy'
+
+    def facebook(url):
+        """Interact with the developer.facebook.com API."""
+        return requests.get('https://developers.facebook.com/tools/debug/echo/?q=' + url,
+                            verify=False).text
+
+    def pixlr(url):
+        """Interact with the pixlr.com API."""
+        if url == main_url:
+            # Because pixlr throws error if http://example.com is used
+            url = main_url + '/'
+        return requests.get('https://pixlr.com/proxy/?url=' + url,
+                            headers={'Accept-Encoding': 'gzip'}, verify=False).text
+
+    def code_beautify(url):
+        """Interact with the codebeautify.org API."""
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
+            'Accept': 'text/plain, */*; q=0.01',
+            'Accept-Encoding': 'gzip',
+            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+            'Origin': 'https://codebeautify.org',
+            'Connection': 'close',
+        }
+        return requests.post('https://codebeautify.com/URLService', headers=headers,
+                             data='path=' + url, verify=False).text
+
+    def photopea(url):
+        """Interact with the www.photopea.com API."""
+        return requests.get(
+            'https://www.photopea.com/mirror.php?url=' + url, verify=False).text
+
+    if ninja:  # If the ninja mode is enabled
+        # Select a random request function i.e. random API
+        response = random.choice(
+            [photopea, normal, facebook, pixlr, code_beautify])(url)
+        return response or 'dummy'
+    else:
+        return normal(url)
\ No newline at end of file
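
The refactored requester() now takes everything it used to read from photon.py's globals as explicit parameters. A rough sketch of calling it directly (all argument values here are placeholders):

    from core.requester import requester

    body = requester('http://example.com/page',
                     main_url='http://example.com',
                     delay=0,
                     cook={},                    # cookies, if any
                     headers={},                 # empty -> built-in header set is used
                     timeout=10,
                     host='example.com',
                     ninja=False,                # True routes requests through public APIs
                     user_agents=['Photon'],
                     failed=set(),
                     processed=set())
    if body != 'dummy':                          # 'dummy' marks a failed/skipped fetch
        print(len(body))
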
+ """ + print('%s Checking for updates' % run) + # Changes must be separated by ; + changes = "cloning (mirroring) feature;fixed sitemap.xml parsing;reuse tcp connection to boost speed;handle redirect loops;csv export support;other minor bug fixes" + latest_commit = requester('https://raw.githubusercontent.com/s0md3v/Photon/master/photon.py', host='github.com').text + # Just a hack to see if a new version is available + if changes not in latest_commit: + changelog = re.search(r"changes = '''(.*?)'''", latest_commit) + # Splitting the changes to form a list + changelog = changelog.group(1).split(';') + print('%s A new version of Photon is available.' % good) + print('%s Changes:' % info) + for change in changelog: # print changes + print('%s>%s %s' % (green, end, change)) + + current_path = os.getcwd().split('/') # if you know it, you know it + folder = current_path[-1] # current directory name + path = '/'.join(current_path) # current directory path + choice = input('%s Would you like to update? [Y/n] ' % que).lower() + + if choice != 'n': + print('%s Updating Photon' % run) + os.system('git clone --quiet https://github.com/s0md3v/Photon %s' + % (folder)) + os.system('cp -r %s/%s/* %s && rm -r %s/%s/ 2>/dev/null' + % (path, folder, path, path, folder)) + print('%s Update successful!' % good) + else: + print('%s Photon is up to date!' % good) diff --git a/core/utils.py b/core/utils.py new file mode 100644 index 0000000..bf75546 --- /dev/null +++ b/core/utils.py @@ -0,0 +1,125 @@ +import re +import tld +import math +from core.config import verbose, badTypes +from core.colors import info + +try: + from urllib.parse import urlparse +except: + from urlparse import urlparse + + +def regxy(pattern, response, supress_regex, custom): + """Extract a string based on regex pattern supplied by user.""" + try: + matches = re.findall(r'%s' % pattern, response) + for match in matches: + verb('Custom regex', match) + custom.add(match) + except: + supress_regex = True + + +def is_link(url, processed, files): + """Check whether an URL should be crawled or not.""" + # File extension that don't need to be crawled and are files + # Whether the the url should be crawled or not + conclusion = False + # If the URL hasn't been crawled already + if url not in processed: + if url.split('.')[-1].lower() in badTypes: + files.add(url) + else: + return True + return conclusion + + +def remove_regex(urls, regex): + """ + Parse a list for non-matches to a regex. 
diff --git a/core/utils.py b/core/utils.py
new file mode 100644
index 0000000..bf75546
--- /dev/null
+++ b/core/utils.py
@@ -0,0 +1,125 @@
+import re
+import tld
+import math
+from core.config import verbose, badTypes
+from core.colors import info
+
+try:
+    from urllib.parse import urlparse
+except:
+    from urlparse import urlparse
+
+
+def regxy(pattern, response, supress_regex, custom):
+    """Extract a string based on regex pattern supplied by user."""
+    try:
+        matches = re.findall(r'%s' % pattern, response)
+        for match in matches:
+            verb('Custom regex', match)
+            custom.add(match)
+    except:
+        supress_regex = True
+
+
+def is_link(url, processed, files):
+    """Check whether a URL should be crawled or not."""
+    # File extensions that don't need to be crawled and are treated as files
+    # Whether the URL should be crawled or not
+    conclusion = False
+    # If the URL hasn't been crawled already
+    if url not in processed:
+        if url.split('.')[-1].lower() in badTypes:
+            files.add(url)
+        else:
+            return True
+    return conclusion
+
+
+def remove_regex(urls, regex):
+    """
+    Parse a list for non-matches to a regex.
+
+    Args:
+        urls: iterable of urls
+        custom_regex: string regex to be parsed for
+
+    Returns:
+        list of strings not matching regex
+    """
+
+    if not regex:
+        return urls
+
+    # To avoid iterating over the characters of a string
+    if not isinstance(urls, (list, set, tuple)):
+        urls = [urls]
+
+    try:
+        non_matching_urls = [url for url in urls if not re.search(regex, url)]
+    except TypeError:
+        return []
+
+    return non_matching_urls
+
+def writer(datasets, dataset_names, output_dir):
+    """Write the results."""
+    for dataset, dataset_name in zip(datasets, dataset_names):
+        if dataset:
+            filepath = output_dir + '/' + dataset_name + '.txt'
+            with open(filepath, 'w+') as out_file:
+                joined = '\n'.join(dataset)
+                out_file.write(str(joined.encode('utf-8')))
+                out_file.write('\n')
+
+def timer(diff, processed):
+    """Return the passed time."""
+    # Changes seconds into minutes and seconds
+    minutes, seconds = divmod(diff, 60)
+    try:
+        # Finds average time taken by requests
+        time_per_request = diff / float(len(processed))
+    except ZeroDivisionError:
+        time_per_request = 0
+    return minutes, seconds, time_per_request
+
+def entropy(string):
+    """Calculate the entropy of a string."""
+    entropy = 0
+    for number in range(256):
+        result = float(string.encode('utf-8').count(
+            chr(number)))/len(string.encode('utf-8'))
+        if result != 0:
+            entropy = entropy - result * math.log(result, 2)
+    return entropy
+
+def xmlParser(response):
+    """Extract links from .xml files."""
+    # Regex for extracting URLs between <loc> tags
+    return re.findall(r'<loc>(.*?)</loc>', response)
+
+def verb(kind, string):
+    """Enable verbose output."""
+    if verbose:
+        print('%s %s: %s' % (info, kind, string))
+
+def extract_headers(headers):
+    """This function extracts valid headers from interactive input."""
+    sorted_headers = {}
+    matches = re.findall(r'(.*):\s(.*)', headers)
+    for match in matches:
+        header = match[0]
+        value = match[1]
+        try:
+            if value[-1] == ',':
+                value = value[:-1]
+            sorted_headers[header] = value
+        except IndexError:
+            pass
+    return sorted_headers
+
+def top_level(url):
+    """Extract the top level domain from a URL."""
+    ext = tld.get_tld(url, fix_protocol=True)
+    toplevel = '.'.join(urlparse(url).netloc.split('.')[-2:]).split(
+        ext)[0] + ext
+    return toplevel
diff --git a/core/zap.py b/core/zap.py
new file mode 100644
index 0000000..5363a65
--- /dev/null
+++ b/core/zap.py
@@ -0,0 +1,54 @@
+import re
+import requests
+from core.requester import requester
+from core.utils import verb, xmlParser
+from core.colors import run, good
+from plugins.wayback import time_machine
+
+def zap(inputUrl, archive, domain, host, internal, robots):
+    """Extract links from robots.txt and sitemap.xml."""
+    if archive:
+        from plugins.wayback import time_machine
+        print('%s Fetching URLs from archive.org' % run)
+        if False:
+            archived_urls = time_machine(domain, 'domain')
+        else:
+            archived_urls = time_machine(host, 'host')
+        print('%s Retrieved %i URLs from archive.org' % (
+            good, len(archived_urls) - 1))
+        for url in archived_urls:
+            verb('Internal page', url)
+            internal.add(url)
+    # Makes request to robots.txt
+    response = requests.get(inputUrl + '/robots.txt').text
+    # Making sure robots.txt isn't some fancy 404 page
+    if '<body' not in response:
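
Elsewhere in this changeset, extractor() flags any 16-45 character match whose entropy() value is at least 4 as a potential key. The helper computes plain Shannon entropy over the token's bytes; a small self-contained illustration (the function name and sample tokens are invented, written in Python 3 style rather than copied from core/utils.py):

    import math

    def shannon_entropy(token):
        # frequency of each byte value, summed as -p*log2(p)
        data = token.encode('utf-8')
        result = 0.0
        for byte in set(data):
            p = data.count(byte) / len(data)
            result -= p * math.log(p, 2)
        return result

    print(shannon_entropy('aaaaaaaaaaaaaaaaaaaa'))  # 0.0  -> ignored
    print(shannon_entropy('9vKp3QzLt7XbR1mWc4yF'))  # ~4.3 -> flagged as a key
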
diff --git a/photon.py b/photon.py
--- a/photon.py
+++ b/photon.py
-            print('%s>%s %s' % (green, end, change))
-
-        current_path = os.getcwd().split('/')  # if you know it, you know it
-        folder = current_path[-1]  # current directory name
-        path = '/'.join(current_path)  # current directory path
-        choice = input('%s Would you like to update? [Y/n] ' % que).lower()
-
-        if choice != 'n':
-            print('%s Updating Photon' % run)
-            os.system('git clone --quiet https://github.com/s0md3v/Photon %s'
-                      % (folder))
-            os.system('cp -r %s/%s/* %s && rm -r %s/%s/ 2>/dev/null'
-                      % (path, folder, path, path, folder))
-            print('%s Update successful!' % good)
-    else:
-        print('%s Photon is up to date!' % good)
-
-
-session = requests.Session()
-session.max_redirects = 3
-
 # If the user has supplied --update argument
 if args.update:
-    update()
+    updater()
     quit()
 # If the user has supplied a URL
@@ -165,6 +112,7 @@ def update():
     print('\n' + parser.format_help().lower())
     quit()
+clone = args.clone
 headers = args.headers  # prompt for headers
 verbose = args.verbose  # verbose output
 delay = args.delay or 0  # Delay between requests
@@ -196,22 +144,7 @@ def update():
 bad_intel = set()  # Unclean intel urls
 bad_scripts = set()  # Unclean javascript file urls
-
-def extract_headers(headers):
-    """This function extracts valid headers from interactive input."""
-    sorted_headers = {}
-    matches = findall(r'(.*):\s(.*)', headers)
-    for match in matches:
-        header = match[0]
-        value = match[1]
-        try:
-            if value[-1] == ',':
-                value = value[:-1]
-            sorted_headers[header] = value
-        except IndexError:
-            pass
-    return sorted_headers
-
+core.config.verbose = verbose
 if headers:
     headers = extract_headers(prompt())
@@ -234,14 +167,6 @@ def extract_headers(headers):
 output_dir = args.output or host
-
-def top_level(url):
-    """Extract the top level domain from an URL."""
-    ext = tld.get_tld(host, fix_protocol=True)
-    toplevel = '.'.join(urlparse(main_url).netloc.split('.')[-2:]).split(
-        ext)[0] + ext
-    return toplevel
-
 try:
     domain = top_level(main_url)
 except:
@@ -254,200 +179,11 @@ def top_level(url):
 user_agents = [agent.strip('\n') for agent in uas]
-def requester(url):
-    """Handle the requests and return the response body."""
-    # Mark the URL as crawled
-    processed.add(url)
-    # Pause/sleep the program for specified time
-    time.sleep(delay)
-
-    def normal(url):
-        """Default request"""
-        finalHeaders = headers or {
-            'Host': host,
-            # Selecting a random user-agent
-            'User-Agent': random.choice(user_agents),
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Accept-Encoding': 'gzip',
-            'DNT': '1',
-            'Connection': 'close',
-        }
-        try:
-            response = session.get(url, cookies=cook, headers=finalHeaders, verify=False,
-                                   timeout=timeout, stream=True)
-        except TooManyRedirects:
-            return 'dummy'
-        if 'text/html' in response.headers['content-type']:
-            if response.status_code != '404':
-                return response.text
-            else:
-                response.close()
-                failed.add(url)
-                return 'dummy'
-        else:
-            response.close()
-            return 'dummy'
-
-    def facebook(url):
-        """Interact with the developer.facebook.com API."""
-        return requests.get('https://developers.facebook.com/tools/debug/echo/?q=' + url,
-                            verify=False).text
-
-    def pixlr(url):
-        """Interact with the pixlr.com API."""
-        if url == main_url:
-            # Because pixlr throws error if http://example.com is used
-            url = main_url + '/'
-        return requests.get('https://pixlr.com/proxy/?url=' + url,
-                            headers={'Accept-Encoding' : 'gzip'}, verify=False).text
-
-    def code_beautify(url):
-        """Interact with the codebeautify.org API."""
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
-            'Accept': 'text/plain, */*; q=0.01',
-            'Accept-Encoding': 'gzip',
-            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
-            'Origin': 'https://codebeautify.org',
-            'Connection': 'close',
-        }
-        return requests.post('https://codebeautify.com/URLService', headers=headers,
-                             data='path=' + url, verify=False).text
-
-    def photopea(url):
-        """Interact with the www.photopea.com API."""
-        return requests.get(
-            'https://www.photopea.com/mirror.php?url=' + url, verify=False).text
-
-    if ninja:  # If the ninja mode is enabled
-        # Select a random request function i.e. random API
-        response = random.choice(
-            [photopea, normal, facebook, pixlr, code_beautify])(url)
-        return response or 'dummy'
-    else:
-        return normal(url)
-
-
-def verb(kind, string):
-    """Enable verbose output."""
-    if verbose:
-        print('%s %s: %s' % (info, kind, string))
-
-
-def xmlParser(response):
-    """Extract links from .xml files."""
-    # Regex for extracting URLs
-    return findall(r'<loc>(.*?)</loc>', response)
-
-
-def zap(inputUrl):
-    """Extract links from robots.txt and sitemap.xml."""
-    if args.archive:
-        from plugins.wayback import time_machine
-        print('%s Fetching URLs from archive.org' % run)
-        if False:
-            archived_urls = time_machine(domain, 'domain')
-        else:
-            archived_urls = time_machine(host, 'host')
-        print('%s Retrieved %i URLs from archive.org' % (
-            good, len(archived_urls) - 1))
-        for url in archived_urls:
-            verb('Internal page', url)
-            internal.add(url)
-    # Makes request to robots.txt
-    response = requests.get(inputUrl + '/robots.txt', verify=False).text
-    # Making sure robots.txt isn't some fancy 404 page
-    if '<body' not in response:
-    matches = findall(r'<(script|SCRIPT).*(src|SRC)=([^\s>]+)', response)
+    matches = re.findall(r'<(script|SCRIPT).*(src|SRC)=([^\s>]+)', response)
     for match in matches:
         match = match[2].replace('\'', '').replace('"', '')
         verb('JS file', match)
         bad_scripts.add(match)
-
-def entropy(payload):
-    """Calculate the entropy of a string."""
-    entropy = 0
-    for number in range(256):
-        result = float(payload.encode('utf-8').count(
-            chr(number)))/len(payload.encode('utf-8'))
-        if result != 0:
-            entropy = entropy - result * log(result, 2)
-    return entropy
-
-
 def extractor(url):
     """Extract details from the response body."""
-    response = requester(url)
-    matches = findall(r'<[aA].*(href|HREF)=([^\s>]+)', response)
+    response = requester(url, main_url, delay, cook, headers, timeout, host, ninja, user_agents, failed, processed)
+    if clone:
+        mirror(url, response)
+    matches = re.findall(r'<[aA].*(href|HREF)=([^\s>]+)', response)
     for link in matches:
         # Remove everything after a "#" to deal with in-page anchors
         link = link[1].replace('\'', '').replace('"', '').split('#')[0]
         # Checks if the URLs should be crawled
-        if is_link(link):
+        if is_link(link, processed, files):
             if link[:4] == 'http':
                 if link.startswith(main_url):
                     verb('Internal page', link)
@@ -509,9 +235,9 @@ def extractor(url):
     intel_extractor(response)
     js_extractor(response)
     if args.regex and not supress_regex:
-        regxy(args.regex, response)
+        regxy(args.regex, response, supress_regex, custom)
     if api:
-        matches = findall(r'[\w-]{16,45}', response)
+        matches = re.findall(r'[\w-]{16,45}', response)
         for match in matches:
             if entropy(match) >= 4:
                 verb('Key', match)
@@ -520,68 +246,24 @@ def extractor(url):
 def jscanner(url):
     """Extract endpoints from JavaScript code."""
-    response = requester(url)
+    response = requester(url, main_url, delay, cook, headers, timeout, host, ninja, user_agents, failed, processed)
     # Extract URLs/endpoints
-    matches = findall(r'[\'"](/.*?)[\'"]|[\'"](http.*?)[\'"]', response)
+    matches = re.findall(r'[\'"](/.*?)[\'"]|[\'"](http.*?)[\'"]', response)
     # Iterate over the matches, match is a tuple
     for match in matches:
         # Combining the items because one of them is always empty
         match = match[0] + match[1]
         # Making sure it's not some JavaScript code
-        if not search(r'[}{><"\']', match) and not match == '/':
+        if not re.search(r'[}{><"\']', match) and not match == '/':
             verb('JS endpoint', match)
             endpoints.add(match)
-def threader(function, *urls):
-    """Start multiple threads for a function."""
-    threads = []
-    # Because URLs is a tuple
-    urls = urls[0]
-    # Iterating over URLs
-    for url in urls:
-        task = threading.Thread(target=function, args=(url,))
-        threads.append(task)
-    # Start threads
-    for thread in threads:
-        thread.start()
-    # Wait for all threads to complete their work
-    for thread in threads:
-        thread.join()
-    # Delete threads
-    del threads[:]
-
-
-def flash(function, links):
-    """Process the URLs and uses a threadpool to execute a function."""
-    # Convert links (set) to list
-    links = list(links)
-    if sys.version_info < (3, 2):
-        for begin in range(0, len(links), thread_count):  # Range with step
-            end = begin + thread_count
-            splitted = links[begin:end]
-            threader(function, splitted)
-            progress = end
-            if progress > len(links):  # Fix if overflow
-                progress = len(links)
-            print('\r%s Progress: %i/%i' % (info, progress, len(links)),
-                  end='\r')
-            sys.stdout.flush()
-    else:
-        threadpool = concurrent.futures.ThreadPoolExecutor(
-            max_workers=thread_count)
-        futures = (threadpool.submit(function, link) for link in links)
-        for i, _ in enumerate(concurrent.futures.as_completed(futures)):
-            if i + 1 == len(links) or (i + 1) % thread_count == 0:
-                print('%s Progress: %i/%i' % (info, i + 1, len(links)),
-                      end='\r')
-    print('')
-
 # Records the time at which crawling started
 then = time.time()
 # Step 1. Extract urls from robots.txt & sitemap.xml
-zap(main_url)
+zap(main_url, args.archive, domain, host, internal, robots)
 # This is so the level 1 emails are parsed as well
 internal = set(remove_regex(internal, args.exclude))
@@ -599,7 +281,7 @@ def flash(function, links):
             break
     print('%s Level %i: %i URLs' % (run, level + 1, len(links)))
     try:
-        flash(extractor, links)
+        flash(extractor, links, thread_count)
     except KeyboardInterrupt:
         print('')
         break
@@ -614,7 +296,7 @@ def flash(function, links):
         scripts.add(main_url + '/' + match)
 # Step 3. Scan the JavaScript files for endpoints
 print('%s Crawling %i JavaScript files' % (run, len(scripts)))
-flash(jscanner, scripts)
+flash(jscanner, scripts, thread_count)
 for url in internal:
     if '=' in url:
@@ -626,7 +308,7 @@ def flash(function, links):
         intel.add(x)
 for url in external:
     try:
-        if tld.get_tld(url, fix_protocol=True) in intels:
+        if top_level(url) in intels:
             intel.add(url)
     except:
         pass
@@ -635,19 +317,7 @@ def flash(function, links):
 now = time.time()
 # Finds total time taken
 diff = (now - then)
-
-
-def timer(diff):
-    """Return the passed time."""
-    # Changes seconds into minutes and seconds
-    minutes, seconds = divmod(diff, 60)
-    try:
-        # Finds average time taken by requests
-        time_per_request = diff / float(len(processed))
-    except ZeroDivisionError:
-        time_per_request = 0
-    return minutes, seconds, time_per_request
-minutes, seconds, time_per_request = timer(diff)
+minutes, seconds, time_per_request = timer(diff, processed)
 # Step 4. Save the results
 if not os.path.exists(output_dir):  # if the directory doesn't exist
@@ -658,22 +328,6 @@ def timer(diff):
 dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'internal',
                  'scripts', 'external', 'fuzzable', 'endpoints', 'keys']
-def writer(datasets, dataset_names, output_dir):
-    """Write the results."""
-    for dataset, dataset_name in zip(datasets, dataset_names):
-        if dataset:
-            filepath = output_dir + '/' + dataset_name + '.txt'
-            if python3:
-                with open(filepath, 'w+', encoding='utf8') as out_file:
-                    out_file.write(str('\n'.join(dataset)))
-                    out_file.write('\n')
-            else:
-                with open(filepath, 'w+') as out_file:
-                    joined = '\n'.join(dataset)
-                    out_file.write(str(joined.encode('utf-8')))
-                    out_file.write('\n')
-
-
 writer(datasets, dataset_names, output_dir)
 # Printing out results
 print(('%s-%s' % (red, end)) * 50)
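
With writer() and timer() moved into core/utils.py, the reporting tail of photon.py is reduced to plain calls into the package. A stripped-down sketch of that flow outside the crawler (the dataset contents and output directory here are invented):

    import time
    from core.utils import timer, writer

    then = time.time()
    processed = {'http://example.com/', 'http://example.com/about'}  # pretend crawl
    datasets = [processed]
    dataset_names = ['internal']

    minutes, seconds, time_per_request = timer(time.time() - then, processed)
    writer(datasets, dataset_names, '.')  # writes ./internal.txt
    print('%i minutes %i seconds (%.2fs per request)' % (minutes, seconds, time_per_request))
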