Modular structure & Cloning Ability (v1.2.1)
Showing 9 changed files with 447 additions and 379 deletions.

core/colors.py
@@ -0,0 +1,20 @@
import sys

colors = True  # Output should be colored
machine = sys.platform  # Detecting the OS of the current system
if machine.lower().startswith(('os', 'win', 'darwin', 'ios')):
    colors = False  # Colors shouldn't be displayed on macOS, Windows and iOS
if not colors:
    # 'back' is included here so it is always defined, even without colors
    end = red = white = green = yellow = back = run = bad = good = info = que = ''
else:
    white = '\033[97m'
    green = '\033[92m'
    red = '\033[91m'
    yellow = '\033[93m'
    end = '\033[0m'
    back = '\033[7;91m'
    info = '\033[93m[!]\033[0m'
    que = '\033[94m[?]\033[0m'
    bad = '\033[91m[-]\033[0m'
    good = '\033[92m[+]\033[0m'
    run = '\033[97m[~]\033[0m'
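
A minimal usage sketch (the call site and messages are illustrative, not part of the commit): the labels are plain strings, so they drop straight into %-formatted output, and they degrade to empty strings on platforms where colors are disabled.

from core.colors import good, info, run

print('%s Crawling the target' % run)
print('%s robots.txt found' % good)
print('%s 42 requests made so far' % info)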

core/config.py
@@ -1,4 +1,7 @@
"""Configuration options for Photon.""" | ||
|
||
verbose = False | ||
|
||
intels = [ | ||
'facebook.com', | ||
'github.com', | ||
|

core/flash.py
@@ -0,0 +1,53 @@
from __future__ import print_function

import sys

from core.colors import info

try:
    import concurrent.futures
except ImportError:
    # Python < 3.2 has no concurrent.futures; threader() below is the fallback
    import threading


def threader(function, *urls):
    """Start multiple threads for a function."""
    threads = []
    # Because urls arrives as a one-element tuple
    urls = urls[0]
    # Iterating over URLs
    for url in urls:
        task = threading.Thread(target=function, args=(url,))
        threads.append(task)
    # Start threads
    for thread in threads:
        thread.start()
    # Wait for all threads to complete their work
    for thread in threads:
        thread.join()
    # Delete threads
    del threads[:]


def flash(function, links, thread_count):
    """Process the URLs with a thread pool that executes a function."""
    # Convert links (set) to list
    links = list(links)
    if sys.version_info < (3, 2):
        for begin in range(0, len(links), thread_count):  # Range with step
            end = begin + thread_count
            splitted = links[begin:end]
            threader(function, splitted)
            progress = end
            if progress > len(links):  # Fix if overflow
                progress = len(links)
            print('\r%s Progress: %i/%i' % (info, progress, len(links)),
                  end='\r')
            sys.stdout.flush()
    else:
        threadpool = concurrent.futures.ThreadPoolExecutor(
            max_workers=thread_count)
        futures = (threadpool.submit(function, link) for link in links)
        for i, _ in enumerate(concurrent.futures.as_completed(futures)):
            if i + 1 == len(links) or (i + 1) % thread_count == 0:
                print('%s Progress: %i/%i' % (info, i + 1, len(links)),
                      end='\r')
    print('')
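
A minimal usage sketch (the callback and link set are illustrative, not from the commit): flash() drives the callback over every link with thread_count workers, printing a progress counter as work completes.

from core.flash import flash

def extractor(url):
    # Stand-in for the real per-URL work, e.g. fetching and parsing the page
    print('visited %s' % url)

links = {'http://example.com/a', 'http://example.com/b', 'http://example.com/c'}
flash(extractor, links, 2)  # 2 worker threads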

core/mirror.py
@@ -0,0 +1,39 @@
import os


def mirror(url, response):
    """Write the response to disk, mirroring the remote directory structure."""
    if response != 'dummy':
        clean_url = url.replace('http://', '').replace('https://', '').rstrip('/')
        parts = clean_url.split('?')[0].split('/')
        root = parts[0]
        webpage = parts[-1]
        parts.remove(root)
        try:
            parts.remove(webpage)
        except ValueError:
            pass
        prefix = root + '_mirror'
        try:
            os.mkdir(prefix)
        except OSError:
            pass
        suffix = ''
        if parts:
            for directory in parts:
                suffix += directory + '/'
                # Create each intermediate directory, one level at a time
                try:
                    os.mkdir(prefix + '/' + suffix)
                except OSError:
                    pass
        path = prefix + '/' + suffix
        trail = ''
        if '.' not in webpage:
            trail += '.html'
        if webpage == root:
            name = 'index.html'
        else:
            name = webpage
        if len(url.split('?')) > 1:
            trail += '?' + url.split('?')[1]
        # Binary mode: the body is encoded to bytes before writing
        with open(path + name + trail, 'wb') as out_file:
            out_file.write(response.encode('utf-8'))
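
A sketch of the on-disk layout mirror() produces, assuming an illustrative URL and a non-'dummy' response:

# mirror('http://example.com/docs/guide?page=2', '<html>...</html>')
# creates:
#   example.com_mirror/
#   example.com_mirror/docs/
#   example.com_mirror/docs/guide.html?page=2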

core/requester.py
@@ -0,0 +1,82 @@
import random
import time

import requests
from requests.exceptions import TooManyRedirects

session = requests.Session()
session.max_redirects = 3


def requester(url, main_url=None, delay=0, cook=None, headers=None, timeout=10,
              host=None, ninja=False, user_agents=('Photon',), failed=None,
              processed=None):
    """Handle the requests and return the response body."""
    # Avoid mutable default arguments; callers usually pass shared sets
    cook = cook or {}
    failed = failed if failed is not None else set()
    processed = processed if processed is not None else set()
    # Mark the URL as crawled
    processed.add(url)
    # Pause/sleep the program for the specified time
    time.sleep(delay)

    def normal(url):
        """Default request"""
        final_headers = headers or {
            'Host': host,
            # Selecting a random user-agent
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip',
            'DNT': '1',
            'Connection': 'close',
        }
        try:
            response = session.get(url, cookies=cook, headers=final_headers,
                                   verify=False, timeout=timeout, stream=True)
        except TooManyRedirects:
            return 'dummy'
        if 'text/html' in response.headers.get('content-type', ''):
            if response.status_code != 404:  # status_code is an int, not a str
                return response.text
            else:
                response.close()
                failed.add(url)
                return 'dummy'
        else:
            response.close()
            return 'dummy'

    def facebook(url):
        """Interact with the developers.facebook.com API."""
        return requests.get(
            'https://developers.facebook.com/tools/debug/echo/?q=' + url,
            verify=False).text

    def pixlr(url):
        """Interact with the pixlr.com API."""
        if url == main_url:
            # Because pixlr throws an error if http://example.com is used
            url = main_url + '/'
        return requests.get('https://pixlr.com/proxy/?url=' + url,
                            headers={'Accept-Encoding': 'gzip'},
                            verify=False).text

    def code_beautify(url):
        """Interact with the codebeautify.org API."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
            'Accept': 'text/plain, */*; q=0.01',
            'Accept-Encoding': 'gzip',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Origin': 'https://codebeautify.org',
            'Connection': 'close',
        }
        return requests.post('https://codebeautify.com/URLService',
                             headers=headers, data='path=' + url,
                             verify=False).text

    def photopea(url):
        """Interact with the www.photopea.com API."""
        return requests.get(
            'https://www.photopea.com/mirror.php?url=' + url,
            verify=False).text

    if ninja:  # If the ninja mode is enabled
        # Select a random request function, i.e. a random API
        response = random.choice(
            [photopea, normal, facebook, pixlr, code_beautify])(url)
        return response or 'dummy'
    else:
        return normal(url)
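
A minimal usage sketch (the target URL and bookkeeping sets are illustrative): requester() returns the body text on success and the sentinel string 'dummy' otherwise.

from core.requester import requester

processed, failed = set(), set()
body = requester('http://example.com/', processed=processed, failed=failed)
if body != 'dummy':
    print('Fetched %i characters of HTML' % len(body))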

core/updater.py
@@ -0,0 +1,38 @@
import os
import re

from core.colors import run, que, good, green, end, info
from core.requester import requester


def updater():
    """Update the current installation.

    git clones the latest version and merges it with the current directory.
    """
    print('%s Checking for updates' % run)
    # Changes must be separated by ;
    changes = ("cloning (mirroring) feature;fixed sitemap.xml parsing;"
               "reuse tcp connection to boost speed;handle redirect loops;"
               "csv export support;other minor bug fixes")
    # requester() already returns the body text, not a Response object
    latest_commit = requester(
        'https://raw.githubusercontent.com/s0md3v/Photon/master/photon.py',
        host='github.com')
    # Just a hack to see if a new version is available
    if changes not in latest_commit:
        changelog = re.search(r"changes = '''(.*?)'''", latest_commit)
        # Splitting the changes to form a list
        changelog = changelog.group(1).split(';')
        print('%s A new version of Photon is available.' % good)
        print('%s Changes:' % info)
        for change in changelog:  # print changes
            print('%s>%s %s' % (green, end, change))

        current_path = os.getcwd().split('/')  # if you know it, you know it
        folder = current_path[-1]  # current directory name
        path = '/'.join(current_path)  # current directory path
        choice = input('%s Would you like to update? [Y/n] ' % que).lower()

        if choice != 'n':
            print('%s Updating Photon' % run)
            os.system('git clone --quiet https://github.com/s0md3v/Photon %s'
                      % (folder))
            os.system('cp -r %s/%s/* %s && rm -r %s/%s/ 2>/dev/null'
                      % (path, folder, path, path, folder))
            print('%s Update successful!' % good)
    else:
        print('%s Photon is up to date!' % good)
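
A minimal usage sketch (hypothetical call site): the version check is purely string-based, so the function takes no arguments and prompts before touching the working directory.

from core.updater import updater

updater()  # prints the changelog, then asks before git-cloning over the current directory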

core/utils.py
@@ -0,0 +1,125 @@
import math
import re
from collections import Counter

import tld

from core.colors import info
from core.config import verbose, badTypes

try:
    from urllib.parse import urlparse
except ImportError:  # Python 2 fallback
    from urlparse import urlparse


def regxy(pattern, response, supress_regex, custom):
    """Extract strings based on a regex pattern supplied by the user."""
    try:
        matches = re.findall(r'%s' % pattern, response)
        for match in matches:
            verb('Custom regex', match)
            custom.add(match)
    except re.error:  # Invalid user-supplied pattern
        supress_regex = True


def is_link(url, processed, files):
    """Check whether a URL should be crawled or not."""
    # Whether the URL should be crawled or not
    conclusion = False
    # If the URL hasn't been crawled already
    if url not in processed:
        # File extensions that don't need to be crawled and are treated as files
        if url.split('.')[-1].lower() in badTypes:
            files.add(url)
        else:
            return True
    return conclusion


def remove_regex(urls, regex):
    """
    Parse a list for non-matches to a regex.

    Args:
        urls: iterable of URLs
        regex: string regex to be parsed for

    Returns:
        list of strings not matching regex
    """
    if not regex:
        return urls

    # To avoid iterating over the characters of a string
    if not isinstance(urls, (list, set, tuple)):
        urls = [urls]

    try:
        non_matching_urls = [url for url in urls if not re.search(regex, url)]
    except TypeError:
        return []

    return non_matching_urls


def writer(datasets, dataset_names, output_dir):
    """Write the results."""
    for dataset, dataset_name in zip(datasets, dataset_names):
        if dataset:
            filepath = output_dir + '/' + dataset_name + '.txt'
            # Write bytes directly; wrapping them in str() would produce
            # a mangled "b'...'" literal on Python 3
            with open(filepath, 'wb') as out_file:
                joined = '\n'.join(dataset)
                out_file.write(joined.encode('utf-8'))
                out_file.write(b'\n')


def timer(diff, processed):
    """Return the passed time."""
    # Changes seconds into minutes and seconds
    minutes, seconds = divmod(diff, 60)
    try:
        # Finds average time taken by requests
        time_per_request = diff / float(len(processed))
    except ZeroDivisionError:
        time_per_request = 0
    return minutes, seconds, time_per_request


def entropy(string):
    """Calculate the Shannon entropy of a string."""
    data = string.encode('utf-8')
    entropy = 0
    # Tally each byte value once; counting str characters inside a bytes
    # object raises TypeError on Python 3
    for count in Counter(data).values():
        result = float(count) / len(data)
        entropy -= result * math.log(result, 2)
    return entropy


def xmlParser(response):
    """Extract links from .xml files."""
    # Regex for extracting URLs
    return re.findall(r'<loc>(.*?)</loc>', response)


def verb(kind, string):
    """Print a message if verbose output is enabled."""
    if verbose:
        print('%s %s: %s' % (info, kind, string))


def extract_headers(headers):
    """Extract valid headers from interactive input."""
    sorted_headers = {}
    matches = re.findall(r'(.*):\s(.*)', headers)
    for match in matches:
        header = match[0]
        value = match[1]
        try:
            # Drop the trailing comma left over from pasted header blocks
            if value[-1] == ',':
                value = value[:-1]
            sorted_headers[header] = value
        except IndexError:
            pass
    return sorted_headers


def top_level(url):
    """Extract the top-level domain from a URL."""
    ext = tld.get_tld(url, fix_protocol=True)
    toplevel = '.'.join(urlparse(url).netloc.split('.')[-2:]).split(
        ext)[0] + ext
    return toplevel
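
A few usage sketches (inputs are illustrative; this assumes the module lands at core/utils.py, as the core.* imports elsewhere in the commit suggest):

from core.utils import entropy, extract_headers, top_level

print(top_level('http://blog.example.com/post'))  # example.com
print(entropy('aaaa'))  # 0.0 -- a single repeated byte carries no information
print(extract_headers('Accept: text/html,\nDNT: 1'))  # {'Accept': 'text/html', 'DNT': '1'}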