Modular structure & Cloning Ability (v1.2.1)
Showing 9 changed files with 447 additions and 379 deletions.

core/colors.py
@@ -0,0 +1,20 @@
import sys

colors = True  # Output should be colored
machine = sys.platform  # Detecting the OS of the current system
if machine.lower().startswith(('os', 'win', 'darwin', 'ios')):
    colors = False  # Colors shouldn't be displayed on macOS, Windows and iOS
if not colors:
    # 'back' is included here so it is always defined, even without colors
    end = red = white = green = yellow = back = run = bad = good = info = que = ''
else:
    white = '\033[97m'
    green = '\033[92m'
    red = '\033[91m'
    yellow = '\033[93m'
    end = '\033[0m'
    back = '\033[7;91m'
    info = '\033[93m[!]\033[0m'
    que = '\033[94m[?]\033[0m'
    bad = '\033[91m[-]\033[0m'
    good = '\033[92m[+]\033[0m'
    run = '\033[97m[~]\033[0m'
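
A minimal usage sketch (the call site and messages are illustrative, not part of the commit): the labels are plain strings, so they drop straight into %-formatted output, and they degrade to empty strings on platforms where colors are disabled.

from core.colors import good, info, run

print('%s Crawling the target' % run)
print('%s robots.txt found' % good)
print('%s 42 requests made so far' % info)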

core/config.py
@@ -1,4 +1,7 @@
"""Configuration options for Photon.""" | ||
|
||
verbose = False | ||
|
||
intels = [ | ||
'facebook.com', | ||
'github.com', | ||
|

core/flash.py
@@ -0,0 +1,53 @@
from __future__ import print_function

import sys

from core.colors import info

try:
    import concurrent.futures
except ImportError:
    # Python < 3.2 has no concurrent.futures; threader() below is the fallback
    import threading


def threader(function, *urls):
    """Start multiple threads for a function."""
    threads = []
    # Because urls arrives as a one-element tuple
    urls = urls[0]
    # Iterating over URLs
    for url in urls:
        task = threading.Thread(target=function, args=(url,))
        threads.append(task)
    # Start threads
    for thread in threads:
        thread.start()
    # Wait for all threads to complete their work
    for thread in threads:
        thread.join()
    # Delete threads
    del threads[:]


def flash(function, links, thread_count):
    """Process the URLs with a thread pool that executes a function."""
    # Convert links (set) to list
    links = list(links)
    if sys.version_info < (3, 2):
        for begin in range(0, len(links), thread_count):  # Range with step
            end = begin + thread_count
            splitted = links[begin:end]
            threader(function, splitted)
            progress = end
            if progress > len(links):  # Fix if overflow
                progress = len(links)
            print('\r%s Progress: %i/%i' % (info, progress, len(links)),
                  end='\r')
            sys.stdout.flush()
    else:
        threadpool = concurrent.futures.ThreadPoolExecutor(
            max_workers=thread_count)
        futures = (threadpool.submit(function, link) for link in links)
        for i, _ in enumerate(concurrent.futures.as_completed(futures)):
            if i + 1 == len(links) or (i + 1) % thread_count == 0:
                print('%s Progress: %i/%i' % (info, i + 1, len(links)),
                      end='\r')
    print('')
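
A minimal usage sketch (the callback and link set are illustrative, not from the commit): flash() drives the callback over every link with thread_count workers, printing a progress counter as work completes.

from core.flash import flash

def extractor(url):
    # Stand-in for the real per-URL work, e.g. fetching and parsing the page
    print('visited %s' % url)

links = {'http://example.com/a', 'http://example.com/b', 'http://example.com/c'}
flash(extractor, links, 2)  # 2 worker threads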

core/mirror.py
@@ -0,0 +1,39 @@
import os


def mirror(url, response):
    """Write the response to disk, mirroring the remote directory structure."""
    if response != 'dummy':
        clean_url = url.replace('http://', '').replace('https://', '').rstrip('/')
        parts = clean_url.split('?')[0].split('/')
        root = parts[0]
        webpage = parts[-1]
        parts.remove(root)
        try:
            parts.remove(webpage)
        except ValueError:
            pass
        prefix = root + '_mirror'
        try:
            os.mkdir(prefix)
        except OSError:
            pass
        suffix = ''
        if parts:
            for directory in parts:
                suffix += directory + '/'
                # Create each intermediate directory, one level at a time
                try:
                    os.mkdir(prefix + '/' + suffix)
                except OSError:
                    pass
        path = prefix + '/' + suffix
        trail = ''
        if '.' not in webpage:
            trail += '.html'
        if webpage == root:
            name = 'index.html'
        else:
            name = webpage
        if len(url.split('?')) > 1:
            trail += '?' + url.split('?')[1]
        # Binary mode: the body is encoded to bytes before writing
        with open(path + name + trail, 'wb') as out_file:
            out_file.write(response.encode('utf-8'))
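
A sketch of the on-disk layout mirror() produces, assuming an illustrative URL and a non-'dummy' response:

# mirror('http://example.com/docs/guide?page=2', '<html>...</html>')
# creates:
#   example.com_mirror/
#   example.com_mirror/docs/
#   example.com_mirror/docs/guide.html?page=2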

core/requester.py
@@ -0,0 +1,82 @@
import random
import time

import requests
from requests.exceptions import TooManyRedirects

session = requests.Session()
session.max_redirects = 3


def requester(url, main_url=None, delay=0, cook=None, headers=None, timeout=10,
              host=None, ninja=False, user_agents=('Photon',), failed=None,
              processed=None):
    """Handle the requests and return the response body."""
    # Avoid mutable default arguments; callers usually pass shared sets
    cook = cook or {}
    failed = failed if failed is not None else set()
    processed = processed if processed is not None else set()
    # Mark the URL as crawled
    processed.add(url)
    # Pause/sleep the program for the specified time
    time.sleep(delay)

    def normal(url):
        """Default request"""
        final_headers = headers or {
            'Host': host,
            # Selecting a random user-agent
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip',
            'DNT': '1',
            'Connection': 'close',
        }
        try:
            response = session.get(url, cookies=cook, headers=final_headers,
                                   verify=False, timeout=timeout, stream=True)
        except TooManyRedirects:
            return 'dummy'
        if 'text/html' in response.headers.get('content-type', ''):
            if response.status_code != 404:  # status_code is an int, not a str
                return response.text
            else:
                response.close()
                failed.add(url)
                return 'dummy'
        else:
            response.close()
            return 'dummy'

    def facebook(url):
        """Interact with the developers.facebook.com API."""
        return requests.get(
            'https://developers.facebook.com/tools/debug/echo/?q=' + url,
            verify=False).text

    def pixlr(url):
        """Interact with the pixlr.com API."""
        if url == main_url:
            # Because pixlr throws an error if http://example.com is used
            url = main_url + '/'
        return requests.get('https://pixlr.com/proxy/?url=' + url,
                            headers={'Accept-Encoding': 'gzip'},
                            verify=False).text

    def code_beautify(url):
        """Interact with the codebeautify.org API."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
            'Accept': 'text/plain, */*; q=0.01',
            'Accept-Encoding': 'gzip',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Origin': 'https://codebeautify.org',
            'Connection': 'close',
        }
        return requests.post('https://codebeautify.com/URLService',
                             headers=headers, data='path=' + url,
                             verify=False).text

    def photopea(url):
        """Interact with the www.photopea.com API."""
        return requests.get(
            'https://www.photopea.com/mirror.php?url=' + url,
            verify=False).text

    if ninja:  # If the ninja mode is enabled
        # Select a random request function, i.e. a random API
        response = random.choice(
            [photopea, normal, facebook, pixlr, code_beautify])(url)
        return response or 'dummy'
    else:
        return normal(url)
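
A minimal usage sketch (the target URL and bookkeeping sets are illustrative): requester() returns the body text on success and the sentinel string 'dummy' otherwise.

from core.requester import requester

processed, failed = set(), set()
body = requester('http://example.com/', processed=processed, failed=failed)
if body != 'dummy':
    print('Fetched %i characters of HTML' % len(body))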

core/updater.py
@@ -0,0 +1,38 @@
import os
import re

from core.colors import run, que, good, green, end, info
from core.requester import requester


def updater():
    """Update the current installation.

    git clones the latest version and merges it with the current directory.
    """
    print('%s Checking for updates' % run)
    # Changes must be separated by ;
    changes = ("cloning (mirroring) feature;fixed sitemap.xml parsing;"
               "reuse tcp connection to boost speed;handle redirect loops;"
               "csv export support;other minor bug fixes")
    # requester() already returns the body text, not a Response object
    latest_commit = requester(
        'https://raw.githubusercontent.com/s0md3v/Photon/master/photon.py',
        host='github.com')
    # Just a hack to see if a new version is available
    if changes not in latest_commit:
        changelog = re.search(r"changes = '''(.*?)'''", latest_commit)
        # Splitting the changes to form a list
        changelog = changelog.group(1).split(';')
        print('%s A new version of Photon is available.' % good)
        print('%s Changes:' % info)
        for change in changelog:  # print changes
            print('%s>%s %s' % (green, end, change))

        current_path = os.getcwd().split('/')  # if you know it, you know it
        folder = current_path[-1]  # current directory name
        path = '/'.join(current_path)  # current directory path
        choice = input('%s Would you like to update? [Y/n] ' % que).lower()

        if choice != 'n':
            print('%s Updating Photon' % run)
            os.system('git clone --quiet https://github.com/s0md3v/Photon %s'
                      % (folder))
            os.system('cp -r %s/%s/* %s && rm -r %s/%s/ 2>/dev/null'
                      % (path, folder, path, path, folder))
            print('%s Update successful!' % good)
    else:
        print('%s Photon is up to date!' % good)
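
A minimal usage sketch (hypothetical call site): the version check is purely string-based, so the function takes no arguments and prompts before touching the working directory.

from core.updater import updater

updater()  # prints the changelog, then asks before git-cloning over the current directory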

core/utils.py
@@ -0,0 +1,125 @@
import math
import re
from collections import Counter

import tld

from core.colors import info
from core.config import verbose, badTypes

try:
    from urllib.parse import urlparse
except ImportError:  # Python 2 fallback
    from urlparse import urlparse


def regxy(pattern, response, supress_regex, custom):
    """Extract strings based on a regex pattern supplied by the user."""
    try:
        matches = re.findall(r'%s' % pattern, response)
        for match in matches:
            verb('Custom regex', match)
            custom.add(match)
    except re.error:  # Invalid user-supplied pattern
        supress_regex = True


def is_link(url, processed, files):
    """Check whether a URL should be crawled or not."""
    # Whether the URL should be crawled or not
    conclusion = False
    # If the URL hasn't been crawled already
    if url not in processed:
        # File extensions that don't need to be crawled and are treated as files
        if url.split('.')[-1].lower() in badTypes:
            files.add(url)
        else:
            return True
    return conclusion


def remove_regex(urls, regex):
    """
    Parse a list for non-matches to a regex.

    Args:
        urls: iterable of URLs
        regex: string regex to be parsed for

    Returns:
        list of strings not matching regex
    """
    if not regex:
        return urls

    # To avoid iterating over the characters of a string
    if not isinstance(urls, (list, set, tuple)):
        urls = [urls]

    try:
        non_matching_urls = [url for url in urls if not re.search(regex, url)]
    except TypeError:
        return []

    return non_matching_urls


def writer(datasets, dataset_names, output_dir):
    """Write the results."""
    for dataset, dataset_name in zip(datasets, dataset_names):
        if dataset:
            filepath = output_dir + '/' + dataset_name + '.txt'
            # Write bytes directly; wrapping them in str() would produce
            # a mangled "b'...'" literal on Python 3
            with open(filepath, 'wb') as out_file:
                joined = '\n'.join(dataset)
                out_file.write(joined.encode('utf-8'))
                out_file.write(b'\n')


def timer(diff, processed):
    """Return the passed time."""
    # Changes seconds into minutes and seconds
    minutes, seconds = divmod(diff, 60)
    try:
        # Finds average time taken by requests
        time_per_request = diff / float(len(processed))
    except ZeroDivisionError:
        time_per_request = 0
    return minutes, seconds, time_per_request


def entropy(string):
    """Calculate the Shannon entropy of a string."""
    data = string.encode('utf-8')
    entropy = 0
    # Tally each byte value once; counting str characters inside a bytes
    # object raises TypeError on Python 3
    for count in Counter(data).values():
        result = float(count) / len(data)
        entropy -= result * math.log(result, 2)
    return entropy


def xmlParser(response):
    """Extract links from .xml files."""
    # Regex for extracting URLs
    return re.findall(r'<loc>(.*?)</loc>', response)


def verb(kind, string):
    """Print a message if verbose output is enabled."""
    if verbose:
        print('%s %s: %s' % (info, kind, string))


def extract_headers(headers):
    """Extract valid headers from interactive input."""
    sorted_headers = {}
    matches = re.findall(r'(.*):\s(.*)', headers)
    for match in matches:
        header = match[0]
        value = match[1]
        try:
            # Drop the trailing comma left over from pasted header blocks
            if value[-1] == ',':
                value = value[:-1]
            sorted_headers[header] = value
        except IndexError:
            pass
    return sorted_headers


def top_level(url):
    """Extract the top-level domain from a URL."""
    ext = tld.get_tld(url, fix_protocol=True)
    toplevel = '.'.join(urlparse(url).netloc.split('.')[-2:]).split(
        ext)[0] + ext
    return toplevel
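
A few usage sketches (inputs are illustrative; this assumes the module lands at core/utils.py, as the core.* imports elsewhere in the commit suggest):

from core.utils import entropy, extract_headers, top_level

print(top_level('http://blog.example.com/post'))  # example.com
print(entropy('aaaa'))  # 0.0 -- a single repeated byte carries no information
print(extract_headers('Accept: text/html,\nDNT: 1'))  # {'Accept': 'text/html', 'DNT': '1'}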