diff --git a/core/colors.py b/core/colors.py
new file mode 100644
index 0000000..7e33051
--- /dev/null
+++ b/core/colors.py
@@ -0,0 +1,25 @@
+import sys
+
+colors = True  # Output should be colored
+machine = sys.platform  # The platform identifier of the current system
+if machine.lower().startswith(('os', 'win', 'darwin', 'ios')):
+    colors = False  # Colors shouldn't be displayed on Windows and macOS
+if not colors:
+    end = red = white = green = yellow = back = run = bad = good = info = que = ''
+else:
+ white = '\033[97m'
+ green = '\033[92m'
+ red = '\033[91m'
+ yellow = '\033[93m'
+ end = '\033[0m'
+ back = '\033[7;91m'
+ info = '\033[93m[!]\033[0m'
+ que = '\033[94m[?]\033[0m'
+ bad = '\033[91m[-]\033[0m'
+ good = '\033[92m[+]\033[0m'
+ run = '\033[97m[~]\033[0m'
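+
+# Example usage (illustrative sketch): these prefixes are meant to be
+# interpolated into status messages, e.g.
+#     print('%s Crawling the target' % run)
+#     print('%s Found 42 URLs' % good)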
diff --git a/core/config.py b/core/config.py
index 3103f7c..fd7a357 100644
--- a/core/config.py
+++ b/core/config.py
@@ -1,4 +1,7 @@
"""Configuration options for Photon."""
+
+verbose = False
+
intels = [
'facebook.com',
'github.com',
diff --git a/core/flash.py b/core/flash.py
new file mode 100644
index 0000000..cbaa22c
--- /dev/null
+++ b/core/flash.py
@@ -0,0 +1,62 @@
+from __future__ import print_function
+
+import sys
+import threading
+
+from core.colors import info
+
+try:
+    import concurrent.futures
+except ImportError:
+    # concurrent.futures is unavailable on Python < 3.2;
+    # the threading-based fallback in threader() is used instead
+    pass
+
+def threader(function, *urls):
+    """Start one thread per URL and wait for all of them to finish."""
+    threads = []
+    # *urls packs the caller's single list argument into a one-item tuple
+    urls = urls[0]
+ # Iterating over URLs
+ for url in urls:
+ task = threading.Thread(target=function, args=(url,))
+ threads.append(task)
+ # Start threads
+ for thread in threads:
+ thread.start()
+ # Wait for all threads to complete their work
+ for thread in threads:
+ thread.join()
+ # Delete threads
+ del threads[:]
+
+
+def flash(function, links, thread_count):
+    """Process the URLs with a thread pool, printing progress as batches complete."""
+ # Convert links (set) to list
+ links = list(links)
+ if sys.version_info < (3, 2):
+ for begin in range(0, len(links), thread_count): # Range with step
+ end = begin + thread_count
+ splitted = links[begin:end]
+ threader(function, splitted)
+ progress = end
+ if progress > len(links): # Fix if overflow
+ progress = len(links)
+ print('\r%s Progress: %i/%i' % (info, progress, len(links)),
+ end='\r')
+ sys.stdout.flush()
+ else:
+ threadpool = concurrent.futures.ThreadPoolExecutor(
+ max_workers=thread_count)
+ futures = (threadpool.submit(function, link) for link in links)
+ for i, _ in enumerate(concurrent.futures.as_completed(futures)):
+ if i + 1 == len(links) or (i + 1) % thread_count == 0:
+ print('%s Progress: %i/%i' % (info, i + 1, len(links)),
+ end='\r')
+    print('')
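+
+# Example usage (illustrative sketch; crawl() is a hypothetical callback):
+#     def crawl(url):
+#         ...  # request the URL and record findings
+#     flash(crawl, {'http://example.com/a', 'http://example.com/b'}, 2)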
diff --git a/core/mirror.py b/core/mirror.py
new file mode 100644
index 0000000..f5770ac
--- /dev/null
+++ b/core/mirror.py
@@ -0,0 +1,42 @@
+import os
+
+def mirror(url, response):
+ if response != 'dummy':
+        clean_url = url.replace('http://', '').replace('https://', '').rstrip('/')
+        parts = clean_url.split('?')[0].split('/')
+ root = parts[0]
+ webpage = parts[-1]
+ parts.remove(root)
+ try:
+ parts.remove(webpage)
+ except ValueError:
+ pass
+ prefix = root + '_mirror'
+ try:
+ os.mkdir(prefix)
+ except OSError:
+ pass
+ suffix = ''
+ if parts:
+ for directory in parts:
+ suffix += directory + '/'
+ try:
+ os.mkdir(prefix + '/' + suffix)
+ except OSError:
+ pass
+ path = prefix + '/' + suffix
+ trail = ''
+ if '.' not in webpage:
+ trail += '.html'
+ if webpage == root:
+ name = 'index.html'
+ else:
+ name = webpage
+ if len(url.split('?')) > 1:
+ trail += '?' + url.split('?')[1]
+        with open(path + name + trail, 'wb') as out_file:
+            out_file.write(response.encode('utf-8'))
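+
+# Example (illustrative): mirror('http://example.com/blog/post?id=1', html)
+# writes the response to example.com_mirror/blog/post.html?id=1, creating
+# the example.com_mirror/blog/ directory tree as needed.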
diff --git a/core/requester.py b/core/requester.py
new file mode 100644
index 0000000..5d5c559
--- /dev/null
+++ b/core/requester.py
@@ -0,0 +1,94 @@
+import time
+import random
+import requests
+from requests.exceptions import TooManyRedirects
+
+session = requests.Session()
+session.max_redirects = 3
+
+def requester(url, main_url=None, delay=0, cook=None, headers=None, timeout=10,
+              host=None, ninja=False, user_agents=None, failed=None, processed=None):
+    """Handle the requests and return the response body."""
+    cook = cook or {}
+    user_agents = user_agents or ['Photon']
+    # failed/processed must be sets; list defaults would break .add() below
+    failed = set() if failed is None else failed
+    processed = set() if processed is None else processed
+    # Mark the URL as crawled
+    processed.add(url)
+    # Pause the program for the specified delay
+    time.sleep(delay)
+
+ def normal(url):
+ """Default request"""
+ finalHeaders = headers or {
+ 'Host': host,
+ # Selecting a random user-agent
+ 'User-Agent': random.choice(user_agents),
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+ 'Accept-Language': 'en-US,en;q=0.5',
+ 'Accept-Encoding': 'gzip',
+ 'DNT': '1',
+ 'Connection': 'close',
+ }
+ try:
+ response = session.get(url, cookies=cook, headers=finalHeaders, verify=False,
+ timeout=timeout, stream=True)
+ except TooManyRedirects:
+ return 'dummy'
+        if 'text/html' in response.headers.get('content-type', ''):
+            if response.status_code != 404:
+ return response.text
+ else:
+ response.close()
+ failed.add(url)
+ return 'dummy'
+ else:
+ response.close()
+ return 'dummy'
+
+ def facebook(url):
+ """Interact with the developer.facebook.com API."""
+ return requests.get('https://developers.facebook.com/tools/debug/echo/?q=' + url,
+ verify=False).text
+
+ def pixlr(url):
+ """Interact with the pixlr.com API."""
+ if url == main_url:
+ # Because pixlr throws error if http://example.com is used
+ url = main_url + '/'
+ return requests.get('https://pixlr.com/proxy/?url=' + url,
+ headers={'Accept-Encoding' : 'gzip'}, verify=False).text
+
+ def code_beautify(url):
+ """Interact with the codebeautify.org API."""
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
+ 'Accept': 'text/plain, */*; q=0.01',
+ 'Accept-Encoding': 'gzip',
+ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+ 'Origin': 'https://codebeautify.org',
+ 'Connection': 'close',
+ }
+ return requests.post('https://codebeautify.com/URLService', headers=headers,
+ data='path=' + url, verify=False).text
+
+ def photopea(url):
+ """Interact with the www.photopea.com API."""
+ return requests.get(
+ 'https://www.photopea.com/mirror.php?url=' + url, verify=False).text
+
+ if ninja: # If the ninja mode is enabled
+ # Select a random request function i.e. random API
+ response = random.choice(
+ [photopea, normal, facebook, pixlr, code_beautify])(url)
+ return response or 'dummy'
+ else:
+        return normal(url)
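+
+# Example usage (illustrative sketch; example.com is a placeholder target):
+#     processed, failed = set(), set()
+#     html = requester('http://example.com/', host='example.com',
+#                      processed=processed, failed=failed)
+#     if html != 'dummy':
+#         pass  # parse the response body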
diff --git a/core/updater.py b/core/updater.py
new file mode 100644
index 0000000..fa7882b
--- /dev/null
+++ b/core/updater.py
@@ -0,0 +1,38 @@
+import re
+import os
+from core.requester import requester
+from core.colors import run, que, good, green, end, info
+
+def updater():
+ """Update the current installation.
+
+ git clones the latest version and merges it with the current directory.
+ """
+ print('%s Checking for updates' % run)
+ # Changes must be separated by ;
+ changes = "cloning (mirroring) feature;fixed sitemap.xml parsing;reuse tcp connection to boost speed;handle redirect loops;csv export support;other minor bug fixes"
+    latest_commit = requester('https://raw.githubusercontent.com/s0md3v/Photon/master/photon.py', host='raw.githubusercontent.com')
+ # Just a hack to see if a new version is available
+ if changes not in latest_commit:
+ changelog = re.search(r"changes = '''(.*?)'''", latest_commit)
+ # Splitting the changes to form a list
+ changelog = changelog.group(1).split(';')
+ print('%s A new version of Photon is available.' % good)
+ print('%s Changes:' % info)
+ for change in changelog: # print changes
+ print('%s>%s %s' % (green, end, change))
+
+    current_path = os.getcwd().split('/')  # Components of the current working directory
+    folder = current_path[-1]  # Name of the current directory
+    path = '/'.join(current_path)  # Full path of the current directory
+ choice = input('%s Would you like to update? [Y/n] ' % que).lower()
+
+ if choice != 'n':
+ print('%s Updating Photon' % run)
+ os.system('git clone --quiet https://github.com/s0md3v/Photon %s'
+ % (folder))
+ os.system('cp -r %s/%s/* %s && rm -r %s/%s/ 2>/dev/null'
+ % (path, folder, path, path, folder))
+ print('%s Update successful!' % good)
+ else:
+ print('%s Photon is up to date!' % good)
diff --git a/core/utils.py b/core/utils.py
new file mode 100644
index 0000000..bf75546
--- /dev/null
+++ b/core/utils.py
@@ -0,0 +1,125 @@
+import re
+import tld
+import math
+from core.config import verbose, badTypes
+from core.colors import info
+
+try:
+ from urllib.parse import urlparse
+except ImportError:  # Python 2 fallback
+ from urlparse import urlparse
+
+
+def regxy(pattern, response, supress_regex, custom):
+    """Extract strings matching a regex pattern supplied by the user."""
+    try:
+        matches = re.findall(r'%s' % pattern, response)
+        for match in matches:
+            verb('Custom regex', match)
+            custom.add(match)
+    except re.error:
+        supress_regex = True
+
+
+def is_link(url, processed, files):
+    """Check whether a URL should be crawled or not."""
+    # Whether the URL should be crawled
+    conclusion = False
+    # If the URL hasn't been crawled already
+    if url not in processed:
+        # Extensions listed in badTypes are treated as files, not pages
+        if url.split('.')[-1].lower() in badTypes:
+ files.add(url)
+ else:
+ return True
+ return conclusion
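+
+# Example (illustrative): with files = set(),
+# is_link('http://example.com/logo.png', set(), files) adds the URL to
+# files and returns False, while is_link('http://example.com/about',
+# set(), files) returns True (assuming 'png' is listed in badTypes).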
+
+
+def remove_regex(urls, regex):
+ """
+ Parse a list for non-matches to a regex.
+
+ Args:
+        urls: iterable of URLs
+        regex: string regex to match against
+
+ Returns:
+ list of strings not matching regex
+ """
+
+ if not regex:
+ return urls
+
+ # To avoid iterating over the characters of a string
+ if not isinstance(urls, (list, set, tuple)):
+ urls = [urls]
+
+ try:
+ non_matching_urls = [url for url in urls if not re.search(regex, url)]
+ except TypeError:
+ return []
+
+ return non_matching_urls
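+
+# Example (illustrative):
+#     remove_regex(['http://a.com/x', 'http://a.com/y.js'], r'\.js$')
+#     # -> ['http://a.com/x']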
+
+def writer(datasets, dataset_names, output_dir):
+ """Write the results."""
+ for dataset, dataset_name in zip(datasets, dataset_names):
+ if dataset:
+ filepath = output_dir + '/' + dataset_name + '.txt'
+            with open(filepath, 'wb') as out_file:
+                joined = '\n'.join(dataset)
+                # Write raw bytes; str() of bytes would write "b'...'" on Python 3
+                out_file.write(joined.encode('utf-8'))
+                out_file.write(b'\n')
+
+def timer(diff, processed):
+ """Return the passed time."""
+ # Changes seconds into minutes and seconds
+ minutes, seconds = divmod(diff, 60)
+ try:
+ # Finds average time taken by requests
+ time_per_request = diff / float(len(processed))
+ except ZeroDivisionError:
+ time_per_request = 0
+ return minutes, seconds, time_per_request
+
+def entropy(string):
+    """Calculate the Shannon entropy of a string."""
+    encoded = bytearray(string.encode('utf-8'))
+    entropy = 0
+    for number in range(256):
+        # Frequency of this byte value in the encoded string
+        result = float(encoded.count(number)) / len(encoded)
+        if result != 0:
+            entropy = entropy - result * math.log(result, 2)
+    return entropy
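+
+# Example (illustrative): entropy('aabb') returns 1.0, since each of the two
+# byte values occurs with probability 0.5 and -2 * (0.5 * log2(0.5)) == 1.0.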
+
+def xmlParser(response):
+ """Extract links from .xml files."""
+ # Regex for extracting URLs
+    return re.findall(r'<loc>(.*?)</loc>', response)