-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathkathe-cli.py
executable file
·426 lines (371 loc) · 17.7 KB
/
kathe-cli.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# XXX replace 'file' with 'input', unless it is about files.
from optparse import OptionParser
from datetime import datetime
import csv
import hashlib
import json
import os
import sys
import unicodedata
import secrets
import requests
# Third-party dependencies. When one is missing, print the install hint on
# stdout and stop with exit status 1. sys.exit() is used instead of the
# exit() helper, because exit() is injected by the `site` module for
# interactive use and is not guaranteed to exist in every interpreter setup.
try:
    import redis
except ImportError:
    # for now I simply pin this to the ancient version
    print('sudo -HE pip install redis==2.10.6')
    sys.exit(1)
try:
    import ssdeep
except ImportError:
    # If you get errors during the installation process, install these:
    # sudo apt-get install python3 python-dev python3-dev build-essential
    #   libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip
    #   libfuzzy-dev
    # Alternative hint that is currently not printed:
    #   apt install python3-ssdeep
    print('pip install ssdeep')
    sys.exit(1)
# XXX clean up order of functions etc, do __main__ to prevent cli stuff
# Command line definition. Parsing happens at import time; the results end up
# in the module-level names `options` and `args`.
parser = OptionParser()
parser.add_option("-c", "--context", dest="context", action='store',
                  type='string',
                  help=("context (comma separated)."
                        "E.g.: 'win.isfb,malpedia,2018,2018-01,2018-01-01' "
                        "Make sure the most important context "
                        "(in this example the malware family) "
                        "is the first one in the list."),
                  metavar="REQUIRED")
# No default is configured for -r: the script refuses to store in redis
# without an explicit db number, so the help text must not promise one.
parser.add_option("-r", "--redisdb", dest="redisdb", action="store",
                  type='int',
                  help=('select the redisdb # (1-16) to store in. '
                        'required when storing directly in redis'),
                  metavar="13")
parser.add_option("-f", "--file", dest="filename", action='store',
                  type='string', help="analyse a file.", metavar="FILE")
parser.add_option("-i", "--csv", dest="csvfile", action='store',
                  type='string', help="csv with headers ssdeep,sha256,inputname,context0 (primary context),context1.", metavar="FILE")
parser.add_option("-j", "--json", dest="jason", action='store', type='string',
                  help=("use json formatted strings (per line) as source:"
                        """ ["ssdeephash","name","sha256"] """
                        """like: cat json|while read line;"""
                        """do ./kathe.py -c context -j "${line}";done"""),
                  metavar="JSON input")
# The REST endpoint is read from the local `secrets` settings module.
parser.add_option("-a", "--add", action="store_true",
                  help=f"use REST interface: {secrets.kathe_add_endpoint}")
(options, args) = parser.parse_args()
# Validate the command line: at least one input source (-f, -j or -i) and a
# context (-c) must be given.
has_input = (options.filename is not None
             or options.jason is not None
             or options.csvfile is not None)
if not has_input or options.context is None:
    # parser.error() prints the usage plus this message and exits by itself,
    # so there is nothing to print or handle afterwards.
    parser.error("Missing options, see " + sys.argv[0] + " -h")

# To start with, set all to None.
inputname = None
inputssdeep = None
inputsha256 = None

# An empty context string (-c '') is rejected as well.
if not options.context:
    sys.exit('Setting a context is required')
inputcontext = options.context

# Storage backend: either a local redis db (-r) or the REST interface (-a),
# never both.
if options.redisdb and options.add:
    sys.exit("Choose either direct redis storing, or using a REST add interface")

if options.add:
    # REST mode never talks to redis, so no connection is made and no -r is
    # needed. `r` is still defined so the name exists at module level.
    r = None
elif options.redisdb:
    if not 0 < int(options.redisdb) < 17:
        sys.exit("Give us a real redis db number to work with")
    redisdbnr = options.redisdb
    # `secrets` is a module, so a missing setting would raise AttributeError
    # (not KeyError); fall back to an empty password instead. The password is
    # deliberately not printed.
    redis_password = getattr(secrets, 'redis_password', '')
    # Connect to redis.
    # Also, convert all responses to strings, not bytes
    # NOTE(review): `charset` looks like the legacy spelling of `encoding`;
    # presumably fine with the pinned redis==2.10.6 - confirm before upgrading.
    r = redis.StrictRedis('localhost', 6379, db=redisdbnr,
                          password=redis_password, charset="utf-8",
                          decode_responses=True)
    try:
        # Cheap round trip to verify that the connection is usable.
        r.info()
    except redis.exceptions.ResponseError as e:
        sys.exit(e)
else:
    sys.exit("-r option is required")
# connect to neo4j.
# graph = Graph(secrets.neo4jurl, auth=(secrets.neo4juser, secrets.neo4jpass))
def timestamp():
    """Return the current time as an integer: epoch seconds followed by the
    six digit, zero padded microsecond part (i.e. microseconds since the
    epoch).

    A single clock reading is used for both parts, so the seconds and the
    microseconds always belong together. The seconds are derived with
    datetime.timestamp() rather than strftime("%s"), because "%s" is a
    platform specific extension that is not available everywhere."""
    now = datetime.now()
    # strip the microseconds first so the float conversion is exact
    seconds = int(now.replace(microsecond=0).timestamp())
    return seconds * 1000000 + now.microsecond
def remove_control_characters(s):
    """Return s without the characters whose Unicode general category
    starts with "C". Input such as filenames can contain nasty control
    characters; this drops them (https://stackoverflow.com/a/19016117)"""
    kept = []
    for char in s:
        if unicodedata.category(char).startswith("C"):
            continue
        kept.append(char)
    return "".join(kept)
def replace_badchars(inputstring):
    """Return inputstring with every colon, backslash, double quote,
    single quote, pipe, space and forward slash removed.
    All of them are deleted in one pass with a translation table."""
    removals = {':': '', '\\': '', '"': '', '\'': '', '|': '',
                ' ': '', '/': ''}
    return inputstring.translate(str.maketrans(removals))
def clean_context(contextstring):
    """Normalise the context option so other tools can split it reliably.

    The troublesome characters are stripped, the text takes a lenient
    UTF-8 round trip, commas become pipe symbols, everything is lowercased
    and finally control characters are dropped."""
    stripped = replace_badchars(contextstring)
    # lenient encode/decode: whatever cannot be converted is ignored
    roundtripped = stripped.encode('utf-8', 'ignore').decode('utf-8', 'ignore')
    # make string splitable on pipe symbol and turn to lowercase
    piped = roundtripped.replace(',', '|').lower()
    return remove_control_characters(piped)
def clean_name(filename):
    """Reduce filename to its last path component and strip the characters
    which could cause issues with string parsing. Commas are turned into
    pipe symbols and the result is lowercased.
    """
    # XXX in the case of directories, we'd want dirnames etc.
    base = os.path.basename(filename)
    base = replace_badchars(base)
    # lenient encode/decode: whatever cannot be converted is ignored
    base = base.encode('utf-8', 'ignore').decode('utf-8', 'ignore')
    printable = remove_control_characters(base)
    return printable.replace(',', '|').lower()
# chunked file reading sha256
def file_sha256(inputname):
    """Return the hex encoded sha256 digest of the file at inputname.
    The file is read in chunks of 128 KiB, so memory isn't swamped
    when dealing with large files."""
    digest = hashlib.sha256()
    chunk_size = 128 * 1024
    with open(inputname, 'rb', buffering=0) as handle:
        chunk = handle.read(chunk_size)
        # an empty bytes object marks the end of the file
        while chunk != b'':
            digest.update(chunk)
            chunk = handle.read(chunk_size)
    return digest.hexdigest()
# chunked file reading ssdeep
def file_ssdeep(inputname):
    """Return the ssdeep digest of the file at inputname.
    The file is read in chunks of 128 KiB, so memory isn't swamped
    when dealing with large files."""
    fuzzy = ssdeep.Hash()
    chunk_size = 128 * 1024
    with open(inputname, 'rb', buffering=0) as handle:
        chunk = handle.read(chunk_size)
        # an empty bytes object marks the end of the file
        while chunk != b'':
            fuzzy.update(chunk)
            chunk = handle.read(chunk_size)
    return fuzzy.digest()
# The two functions below (get_all_7_char_rolling_window and
# preprocess_ssdeep) originally come from Brian Wallace:
# https://www.virusbulletin.com/virusbulletin/2015/11/optimizing-ssdeep-use-scale
def get_all_7_char_rolling_window(bs, h):
    """Return the set of all 7 character substrings (rolling window) of h,
    each prefixed with the block size bs and a colon.
    Ssdeep only does a compare if at least 7 characters match between strings.
    These are the keys which hold the sibling values.
    A string shorter than 7 characters yields an empty set."""
    prefix = str(bs) + ":"
    windows = set()
    for start in range(len(h) - 6):
        windows.add(prefix + h[start:start + 7])
    return windows
def preprocess_ssdeep(h):
    """Turn an ssdeep string "block_size:ssdeep:ssdeep_double_block" into
    the set of rolling windows for both parts.

    Runs of the same character longer than 3 are first reduced to exactly 3,
    which is something the ssdeep algorithm does internally too. The windows
    of the first part are prefixed with the block size, those of the second
    part with twice the block size.
    """
    block_size_text, chunks = h.split(":", 1)
    block_size = int(block_size_text)
    # Reduce any sequence of the same char greater than 3 to 3
    for char in set(chunks):
        quad, triple = char * 4, char * 3
        while quad in chunks:
            chunks = chunks.replace(quad, triple)
    block_data, double_block_data = chunks.split(":")
    single = get_all_7_char_rolling_window(block_size, block_data)
    double = get_all_7_char_rolling_window(block_size * 2, double_block_data)
    return single | double
def get_ssdeep_sets(rolling_window_ssdeep, inputssdeep):
    """Return the members of the redis set stored under the key
    rolling_window_ssdeep, without the inputssdeep hash itself.
    Nothing happens when inputssdeep is not a member."""
    members = r.smembers(rolling_window_ssdeep)
    return members - {inputssdeep}
def add_ssdeep_to_rolling_window(rolling_window_ssdeep, inputssdeep):
    """Add the inputssdeep hash to the redis set stored under the
    rolling window key rolling_window_ssdeep."""
    window_key = rolling_window_ssdeep
    r.sadd(window_key, inputssdeep)
def add_info(inputname, inputsha256, inputssdeep, inputcontext):
    """Store the metadata of one entity in redis.

    The info:inputname, info:ssdeep and info:sha256 keys each hold a set
    (read: unique) of strings tying the other fields together, so that
    sha256/inputname/inputssdeep are linked and retrievable. In addition
    the ssdeep hash and the name are added to the "hashes:ssdeep" and
    "names:inputname" sets, every context is counted in the "names:context"
    sorted set and gets an info:context entry, and the "timestamp" key is
    refreshed."""
    inputcontext = clean_context(inputcontext)
    splitcontext = inputcontext.split('|')
    primary_context = splitcontext[0]

    name_key = 'info:inputname:{}'.format(inputname)
    name_value = 'sha256:{}:ssdeep:{}:context:{}'.format(inputsha256,
                                                         inputssdeep,
                                                         inputcontext)
    r.sadd(name_key, name_value)

    ssdeep_key = 'info:ssdeep:{}'.format(inputssdeep)
    ssdeep_value = 'sha256:{}:context:{}:inputname:{}'.format(inputsha256,
                                                              inputcontext,
                                                              inputname)
    r.sadd(ssdeep_key, ssdeep_value)

    sha256_key = 'info:sha256:{}'.format(inputsha256)
    sha256_value = 'ssdeep:{}:context:{}:inputname:{}'.format(inputssdeep,
                                                              inputcontext,
                                                              inputname)
    r.sadd(sha256_key, sha256_value)

    r.sadd("hashes:ssdeep", '{}'.format(inputssdeep))
    r.sadd("names:inputname", '{}'.format(inputname))

    # Look at the most significant context of everything stored for this
    # ssdeep. Whenever it differs from ours, the sorted, slash separated
    # combination of both names is appended to splitcontext so it ends up
    # in "names:context" as well. Because the ssdeeps are similar, this
    # makes different naming schemes explicit.
    for member in r.smembers(ssdeep_key):
        # member layout: sha256:<hash>:context:<contexts>:inputname:<name>
        other_primary = member.split(':')[3].split('|')[0]
        if other_primary == primary_context:
            continue
        combined = '/'.join(sorted([other_primary, primary_context]))
        splitcontext.append(combined)

    info_string = 'sha256:{}:ssdeep:{}:inputname:{}:inputcontext:{}'
    context_value = info_string.format(inputsha256,
                                       inputssdeep, inputname, inputcontext)
    for singlecontext in splitcontext:
        # Count every occurrence with an increment of 1 to build a ranked
        # set. The rank may change over time, but that is not a problem when
        # updates do not happen in between calls.
        # The original author notes that newer redis client versions need
        # the value and the amount swapped.
        r.zincrby("names:context", '{}'.format(singlecontext), amount=1)
        r.sadd('info:context:{}'.format(singlecontext), context_value)

    # The timestamp is used for caching of query results. It is updated
    # after every addition so it never goes stale.
    print(timestamp())
    r.set("timestamp", timestamp())
    print(r.get("timestamp"))
def get_allsha256_for_ssdeep(ssdeep):
    """Return the unique sha256 hashes stored for an ssdeep hash as one
    colon separated string. Theoretically a single ssdeep hash could match
    multiple different files, if the differences are insignificant."""
    unique_sha256s = set()
    for member in r.smembers('info:ssdeep:{}'.format(ssdeep)):
        # the second colon separated field of a member is the sha256
        unique_sha256s.add(member.split(':')[1])
    return ':'.join(unique_sha256s)
def get_allcontext_for_ssdeep(ssdeep):
    """Return the unique context strings stored for an ssdeep hash as one
    colon separated string. Theoretically a single ssdeep hash could match
    multiple different contexts, based on how they are added to the
    dataset."""
    unique_contexts = set()
    for member in r.smembers('info:ssdeep:{}'.format(ssdeep)):
        # the fourth colon separated field of a member is the context
        unique_contexts.add(member.split(':')[3])
    return ':'.join(unique_contexts)
def return_results(inputname, inputsha256, inputssdeep, inputcontext):
    """Collect everything known about an entity in a dict that json.dumps
    can handle (sets are turned into lists).

    Besides the four given fields the dict holds:
    - 'other_inputnames': the names stored for the same sha256, except
      inputname itself;
    - 'siblings': the members of the sorted set stored under inputssdeep
      (partially matching ssdeep hashes) together with their scores.
    """
    other_inputnames = []
    for member in r.smembers('info:sha256:{}'.format(inputsha256)):
        # the last colon separated field of a member is the input name
        name = member.split(':')[-1]
        # Compare for equality: a substring test would also drop other
        # names that merely happen to be contained in inputname.
        if name != inputname:
            other_inputnames.append(name)
    siblings = list(r.zrangebyscore(inputssdeep, min=0,
                                    max='+inf', withscores=True))
    info = {
        'inputname': inputname,
        'sha256': inputsha256,
        'ssdeep': inputssdeep,
        'context': inputcontext,
        'other_inputnames': other_inputnames,
        'siblings': siblings,
    }
    return info
def new_hash(inputsha256):
    """Tell whether a sha256 is new, so known files can take a faster path.
    Return True if new, False if the hash is already a member of the
    "hashes:sha256" set."""
    already_known = r.sismember("hashes:sha256", '{}'.format(inputsha256))
    return not already_known
def add_ssdeep_to_db(inputname, inputsha256, inputssdeep, inputcontext):
    """Store one entity, either through the REST interface (when the add
    option is set) or directly in redis.

    All the logic to store directly is contained in this function. For a
    sha256 that is not known yet, the hash is registered, the info is added
    and the entity is linked with every sibling found through the rolling
    windows. If the sha256 is already known, only the extra info is stored
    (saves a lot of effort).
    """
    # Don't bother with a local redis instance
    if options.add:
        proxies = {}
        kathe_add_endpoint = secrets.kathe_add_endpoint
        entry = {'contexts': inputcontext.split(','),
                 'inputname': inputname,
                 'sha256': inputsha256,
                 'ssdeep': inputssdeep}
        payload = {'info': [entry]}
        print(payload)
        postdata = requests.post(kathe_add_endpoint, json=payload,
                                 proxies=proxies)
        print(postdata.headers, postdata.status_code)
        return

    # Here we start directly talking to redis
    is_new = new_hash(inputsha256)
    inputname = clean_name(inputname)
    if is_new:
        r.sadd("hashes:sha256", '{}'.format(inputsha256))
    add_info(inputname, inputsha256, inputssdeep, inputcontext)
    if not is_new:
        # known file: only the new info had to be added
        return

    entry_template = '{},{},{}'
    for rolling_window_ssdeep in preprocess_ssdeep(inputssdeep):
        # fetch the siblings first, then register ourselves in the window
        siblings = get_ssdeep_sets(rolling_window_ssdeep, inputssdeep)
        add_ssdeep_to_rolling_window(rolling_window_ssdeep, inputssdeep)
        for sibling_ssdeep in siblings:
            # Add sibling_ssdeep to the inputssdeep
            # XXX maybe add zscore check to optimise away the compare
            score_for_input = float(ssdeep.compare(sibling_ssdeep,
                                                   inputssdeep))
            sibling_entry = entry_template.format(
                sibling_ssdeep,
                get_allsha256_for_ssdeep(sibling_ssdeep),
                get_allcontext_for_ssdeep(sibling_ssdeep))
            r.zadd(inputssdeep, score_for_input, sibling_entry)
            # Add inputssdeep to sibling_ssdeep
            score_for_sibling = float(ssdeep.compare(inputssdeep,
                                                     sibling_ssdeep))
            own_entry = entry_template.format(
                inputssdeep,
                inputsha256,
                inputcontext.replace(',', '|'))
            r.zadd(sibling_ssdeep, score_for_sibling, own_entry)
    # XXX return the result in json format if verbose is set
# def add_ssdeep_to_graph(inputname, inputsha256, inputssdeep, inputcontext):
# Pick the input source given on the command line, gather the hashes and
# meta info for it and hand everything to add_ssdeep_to_db().
if options.filename:
    # a single file: calculate both hashes ourselves
    inputname = options.filename
    inputsha256 = file_sha256('{}'.format(inputname))
    inputssdeep = file_ssdeep('{}'.format(inputname))
    add_ssdeep_to_db(inputname, inputsha256, inputssdeep, inputcontext)
elif options.jason:
    # a json array holding the ssdeep hash, the name and the sha256
    jasoninfo = json.loads(options.jason)
    inputssdeep = jasoninfo[0]
    inputname = jasoninfo[1]
    inputsha256 = jasoninfo[2]
    add_ssdeep_to_db(inputname, inputsha256, inputssdeep, inputcontext)
elif options.csvfile:
    # This branch expects a csv file with headers:
    # ssdeep,sha256,inputname,context0 (most important context), context1.
    # Additionally, a context should be given on the cli (-c)
    # The cli contexts are the same for every row, so clean them only once.
    clicontexts = [clicontext for clicontext
                   in clean_context(options.context).split('|')
                   if clicontext != '']
    # newline='' is what the csv module asks for, so that newlines embedded
    # in quoted fields are handled correctly.
    with open(options.csvfile, newline='') as csvhandle:
        for row in csv.DictReader(csvhandle):
            # rows without an ssdeep or sha256 value are skipped
            if not (row['ssdeep'] and row['sha256']):
                continue
            # Optional columns: an empty value, a value missing from a short
            # row or an absent column all fall back to the default.
            rowname = row.get('inputname') or row['sha256']
            context0 = row.get('context0') or 'unknown_type'
            csvcontext = [clean_context(context0)]
            context1 = row.get('context1')
            if context1:
                # NOTE(review): context1 ends up in front of context0, as it
                # did before, although the option help calls context0 the
                # primary context - confirm that this order is intended.
                csvcontext.insert(0, clean_context(context1))
            csvcontext.extend(clicontexts)
            inputcontext = ','.join(csvcontext)
            rowname = clean_name(rowname)
            add_ssdeep_to_db(rowname, row['sha256'], row['ssdeep'],
                             inputcontext)
            print(rowname, row['sha256'], row['ssdeep'], inputcontext)