Modular structure & Cloning Ability (v1.2.1)
s0md3v authored Jan 26, 2019
2 parents 1b1f35a + b890528 commit 93df322
Showing 9 changed files with 447 additions and 379 deletions.
20 changes: 20 additions & 0 deletions core/colors.py
@@ -0,0 +1,20 @@
import sys

colors = True  # Output should be colored
machine = sys.platform  # Detecting the OS of the current system
if machine.lower().startswith(('os', 'win', 'darwin', 'ios')):
    colors = False  # Colors shouldn't be displayed on mac & windows
if not colors:
    # 'back' is included here too so every color name is defined when colors are off
    end = red = white = green = yellow = back = run = bad = good = info = que = ''
else:
    white = '\033[97m'
    green = '\033[92m'
    red = '\033[91m'
    yellow = '\033[93m'
    end = '\033[0m'
    back = '\033[7;91m'
    info = '\033[93m[!]\033[0m'
    que = '\033[94m[?]\033[0m'
    bad = '\033[91m[-]\033[0m'
    good = '\033[92m[+]\033[0m'
    run = '\033[97m[~]\033[0m'
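
A quick, hedged illustration of how these constants are meant to be interpolated into terminal output (the message text below is invented, not from this commit):

    from core.colors import good, run

    print('%s Crawling the target' % run)
    print('%s Found 7 external links' % good)
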
3 changes: 3 additions & 0 deletions core/config.py
@@ -1,4 +1,7 @@
"""Configuration options for Photon."""

verbose = False

intels = [
    'facebook.com',
    'github.com',
53 changes: 53 additions & 0 deletions core/flash.py
@@ -0,0 +1,53 @@
from __future__ import print_function

import sys

from core.colors import info

try:
    import concurrent.futures
except ImportError:
    import threading


def threader(function, *urls):
    """Start multiple threads for a function."""
    threads = []
    # Because URLs is a tuple
    urls = urls[0]
    # Iterating over URLs
    for url in urls:
        task = threading.Thread(target=function, args=(url,))
        threads.append(task)
    # Start threads
    for thread in threads:
        thread.start()
    # Wait for all threads to complete their work
    for thread in threads:
        thread.join()
    # Delete threads
    del threads[:]


def flash(function, links, thread_count):
    """Process the URLs and use a thread pool to execute a function."""
    # Convert links (set) to list
    links = list(links)
    if sys.version_info < (3, 2):
        for begin in range(0, len(links), thread_count):  # Range with step
            end = begin + thread_count
            splitted = links[begin:end]
            threader(function, splitted)
            progress = end
            if progress > len(links):  # Fix if overflow
                progress = len(links)
            print('\r%s Progress: %i/%i' % (info, progress, len(links)),
                  end='\r')
            sys.stdout.flush()
    else:
        threadpool = concurrent.futures.ThreadPoolExecutor(
            max_workers=thread_count)
        futures = (threadpool.submit(function, link) for link in links)
        for i, _ in enumerate(concurrent.futures.as_completed(futures)):
            if i + 1 == len(links) or (i + 1) % thread_count == 0:
                print('%s Progress: %i/%i' % (info, i + 1, len(links)),
                      end='\r')
    print('')
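
For context, a minimal sketch of how flash() might be driven; the extract() worker and the seed links below are illustrative stand-ins, not part of this commit:

    from core.flash import flash

    def extract(url):
        # Hypothetical per-URL worker; Photon passes its own crawl functions here
        print('processing %s' % url)

    links = {'http://example.com/a', 'http://example.com/b', 'http://example.com/c'}
    flash(extract, links, 2)  # run extract() over the links with 2 worker threads
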
39 changes: 39 additions & 0 deletions core/mirror.py
@@ -0,0 +1,39 @@
import os
import re


def mirror(url, response):
    if response != 'dummy':
        cleanUrl = url.replace('http://', '').replace('https://', '').rstrip('/')
        parts = cleanUrl.split('?')[0].split('/')
        root = parts[0]
        webpage = parts[-1]
        parts.remove(root)
        try:
            parts.remove(webpage)
        except ValueError:
            pass
        prefix = root + '_mirror'
        try:
            os.mkdir(prefix)
        except OSError:
            pass
        suffix = ''
        if parts:
            for directory in parts:
                suffix += directory + '/'
                try:
                    os.mkdir(prefix + '/' + suffix)
                except OSError:
                    pass
        path = prefix + '/' + suffix
        trail = ''
        if '.' not in webpage:
            trail += '.html'
        if webpage == root:
            name = 'index.html'
        else:
            name = webpage
        if len(url.split('?')) > 1:
            trail += '?' + url.split('?')[1]
        # Binary mode so the UTF-8 encoded body writes cleanly on Python 2 and 3
        with open(path + name + trail, 'wb') as out_file:
            out_file.write(response.encode('utf-8'))
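
As a hedged aside, the new cloning ability can be exercised on its own roughly like this; the URL is made up and the plain requests call merely stands in for whatever fetched the page:

    import requests

    from core.mirror import mirror

    url = 'http://example.com/about'
    body = requests.get(url).text
    mirror(url, body)  # saves the page as example.com_mirror/about.html
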
82 changes: 82 additions & 0 deletions core/requester.py
@@ -0,0 +1,82 @@
import time
import random

import requests
from requests import get, post
from requests.exceptions import TooManyRedirects

session = requests.Session()
session.max_redirects = 3


def requester(url, main_url=None, delay=0, cook={}, headers={}, timeout=10,
              host=None, ninja=False, user_agents=['Photon'], failed=set(),
              processed=set()):
    """Handle the requests and return the response body."""
    # Mark the URL as crawled
    processed.add(url)
    # Pause/sleep the program for specified time
    time.sleep(delay)

    def normal(url):
        """Default request."""
        finalHeaders = headers or {
            'Host': host,
            # Selecting a random user-agent
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip',
            'DNT': '1',
            'Connection': 'close',
        }
        try:
            response = session.get(url, cookies=cook, headers=finalHeaders, verify=False,
                                   timeout=timeout, stream=True)
        except TooManyRedirects:
            return 'dummy'
        if 'text/html' in response.headers['content-type']:
            if response.status_code != 404:
                return response.text
            else:
                response.close()
                failed.add(url)
                return 'dummy'
        else:
            response.close()
            return 'dummy'

    def facebook(url):
        """Interact with the developers.facebook.com API."""
        return requests.get('https://developers.facebook.com/tools/debug/echo/?q=' + url,
                            verify=False).text

    def pixlr(url):
        """Interact with the pixlr.com API."""
        if url == main_url:
            # Because pixlr throws an error if http://example.com is used
            url = main_url + '/'
        return requests.get('https://pixlr.com/proxy/?url=' + url,
                            headers={'Accept-Encoding': 'gzip'}, verify=False).text

    def code_beautify(url):
        """Interact with the codebeautify.org API."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
            'Accept': 'text/plain, */*; q=0.01',
            'Accept-Encoding': 'gzip',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Origin': 'https://codebeautify.org',
            'Connection': 'close',
        }
        return requests.post('https://codebeautify.com/URLService', headers=headers,
                             data='path=' + url, verify=False).text

    def photopea(url):
        """Interact with the www.photopea.com API."""
        return requests.get(
            'https://www.photopea.com/mirror.php?url=' + url, verify=False).text

    if ninja:  # If the ninja mode is enabled
        # Select a random request function i.e. random API
        response = random.choice(
            [photopea, normal, facebook, pixlr, code_beautify])(url)
        return response or 'dummy'
    else:
        return normal(url)
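
For orientation, a small hedged example of calling requester() directly; the URL and the empty sets are illustrative (inside Photon the crawler supplies its own processed/failed sets):

    from core.requester import requester

    processed, failed = set(), set()
    body = requester('http://example.com/', host='example.com',
                     processed=processed, failed=failed)
    print(body[:80])  # first characters of the HTML, or 'dummy' on failure
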
38 changes: 38 additions & 0 deletions core/updater.py
@@ -0,0 +1,38 @@
import re
import os

from core.requester import requester
from core.colors import run, que, good, green, end, info


def updater():
    """Update the current installation.

    git clones the latest version and merges it with the current directory.
    """
    print('%s Checking for updates' % run)
    # Changes must be separated by ;
    changes = "cloning (mirroring) feature;fixed sitemap.xml parsing;reuse tcp connection to boost speed;handle redirect loops;csv export support;other minor bug fixes"
    # requester() already returns the response body as text
    latest_commit = requester('https://raw.githubusercontent.com/s0md3v/Photon/master/photon.py', host='github.com')
    # Just a hack to see if a new version is available
    if changes not in latest_commit:
        changelog = re.search(r"changes = '''(.*?)'''", latest_commit)
        # Splitting the changes to form a list
        changelog = changelog.group(1).split(';')
        print('%s A new version of Photon is available.' % good)
        print('%s Changes:' % info)
        for change in changelog:  # print changes
            print('%s>%s %s' % (green, end, change))

        current_path = os.getcwd().split('/')  # if you know it, you know it
        folder = current_path[-1]  # current directory name
        path = '/'.join(current_path)  # current directory path
        choice = input('%s Would you like to update? [Y/n] ' % que).lower()

        if choice != 'n':
            print('%s Updating Photon' % run)
            os.system('git clone --quiet https://github.com/s0md3v/Photon %s'
                      % (folder))
            os.system('cp -r %s/%s/* %s && rm -r %s/%s/ 2>/dev/null'
                      % (path, folder, path, path, folder))
            print('%s Update successful!' % good)
    else:
        print('%s Photon is up to date!' % good)
125 changes: 125 additions & 0 deletions core/utils.py
@@ -0,0 +1,125 @@
import re
import tld
import math

from core.config import verbose, badTypes
from core.colors import info

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse


def regxy(pattern, response, supress_regex, custom):
    """Extract a string based on regex pattern supplied by user."""
    try:
        matches = re.findall(r'%s' % pattern, response)
        for match in matches:
            verb('Custom regex', match)
            custom.add(match)
    except:
        supress_regex = True


def is_link(url, processed, files):
    """Check whether a URL should be crawled or not."""
    # File extensions in badTypes don't need to be crawled and are treated as files
    # Whether the URL should be crawled or not
    conclusion = False
    # If the URL hasn't been crawled already
    if url not in processed:
        if url.split('.')[-1].lower() in badTypes:
            files.add(url)
        else:
            return True
    return conclusion


def remove_regex(urls, regex):
    """
    Parse a list for non-matches to a regex.

    Args:
        urls: iterable of urls
        regex: string regex to filter against

    Returns:
        list of strings not matching regex
    """
    if not regex:
        return urls

    # To avoid iterating over the characters of a string
    if not isinstance(urls, (list, set, tuple)):
        urls = [urls]

    try:
        non_matching_urls = [url for url in urls if not re.search(regex, url)]
    except TypeError:
        return []

    return non_matching_urls


def writer(datasets, dataset_names, output_dir):
    """Write the results."""
    for dataset, dataset_name in zip(datasets, dataset_names):
        if dataset:
            filepath = output_dir + '/' + dataset_name + '.txt'
            # Binary mode so the UTF-8 encoded text writes cleanly on Python 2 and 3
            with open(filepath, 'wb') as out_file:
                joined = '\n'.join(dataset)
                out_file.write(joined.encode('utf-8'))
                out_file.write(b'\n')


def timer(diff, processed):
    """Return the passed time."""
    # Changes seconds into minutes and seconds
    minutes, seconds = divmod(diff, 60)
    try:
        # Finds average time taken by requests
        time_per_request = diff / float(len(processed))
    except ZeroDivisionError:
        time_per_request = 0
    return minutes, seconds, time_per_request


def entropy(string):
    """Calculate the entropy of a string."""
    entropy = 0
    for number in range(256):
        result = float(string.encode('utf-8').count(
            chr(number))) / len(string.encode('utf-8'))
        if result != 0:
            entropy = entropy - result * math.log(result, 2)
    return entropy


def xmlParser(response):
    """Extract links from .xml files."""
    # Regex for extracting URLs
    return re.findall(r'<loc>(.*?)</loc>', response)


def verb(kind, string):
    """Print a message if verbose mode is enabled."""
    if verbose:
        print('%s %s: %s' % (info, kind, string))


def extract_headers(headers):
    """Extract valid headers from interactive input."""
    sorted_headers = {}
    matches = re.findall(r'(.*):\s(.*)', headers)
    for match in matches:
        header = match[0]
        value = match[1]
        try:
            if value[-1] == ',':
                value = value[:-1]
            sorted_headers[header] = value
        except IndexError:
            pass
    return sorted_headers


def top_level(url):
    """Extract the top level domain from a URL."""
    ext = tld.get_tld(url, fix_protocol=True)
    toplevel = '.'.join(urlparse(url).netloc.split('.')[-2:]).split(
        ext)[0] + ext
    return toplevel
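
Finally, a brief hedged illustration of two of the helpers above, assuming the tld dependency and the rest of the core package are importable; the header string and URL list are invented for the example:

    from core.utils import extract_headers, remove_regex

    raw = 'Host: example.com\nUser-Agent: Photon\nAccept-Encoding: gzip'
    print(extract_headers(raw))
    # {'Host': 'example.com', 'User-Agent': 'Photon', 'Accept-Encoding': 'gzip'}

    print(remove_regex(['http://a.com/page.php', 'http://a.com/script.js'], r'\.js$'))
    # ['http://a.com/page.php']
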