-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpatchloader.py
206 lines (184 loc) · 9.16 KB
/
patchloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# patchloader.py
# PatchLoader class
#
# Jiyong Jang, 2012
#
import os
import re
import time
import mimetypes
import common
class PatchLoader(object):
    '''
    Load unified-diff patch files and fingerprint every hunk.

    Each hunk that is long enough after normalization is stored as a
    common.PatchInfo built from: a display path, the detected file type,
    the HTML-marked-up hunk text, the normalized lines, and the list of
    hashes of its n-grams.
    '''

    def __init__(self):
        self._patch_list = []   # one common.PatchInfo per accepted hunk
        self._npatch = 0        # len(self._patch_list) as of the last traverse()

    def traverse(self, patch_path):
        '''
        Traverse patch files

        patch_path may name a single file or a directory, which is walked
        recursively. Every file reported as a text type by
        common.file_type() is handed to _process(). A path that is neither
        a file nor a directory is silently ignored.

        Returns the total number of hunks collected so far.
        '''
        print('[+] traversing patch files')
        start_time = time.time()

        if os.path.isfile(patch_path):
            self._process_if_text(patch_path)
        elif os.path.isdir(patch_path):
            for root, _dirs, files in os.walk(patch_path):
                for file_name in files:
                    self._process_if_text(os.path.join(root, file_name))

        self._npatch = len(self._patch_list)
        elapsed_time = time.time() - start_time
        print('[+] %d patches ... %.1fs\n' % (self._npatch, elapsed_time))
        return self._npatch

    def _process_if_text(self, file_path):
        '''
        Process file_path as a patch when its detected type starts with 'text'
        '''
        magic_type = common.file_type(file_path)
        common.verbose_print(' [-] %s: %s' % (file_path, magic_type))
        if magic_type.startswith('text'):
            self._process(file_path)

    def _process(self, patch_path):
        '''
        Normalize a patch file and build a hash list

        The file is scanned line by line as a unified diff:
          '--- <path>'  starts a new file section (and ends the previous hunk)
          '+++ <path>'  only checked for /dev/null (deleted file)
          '@@'          starts a new hunk (and ends the previous one)
          '-' / ' '     lines form the pre-patch ("vulnerable") code
          '+'           lines are kept for display only
        Sections whose old or new path is /dev/null are not processed.
        '''
        # basename also copes with the separators os.path.join produces
        # on platforms where it is not '/'
        patch_filename = os.path.basename(patch_path)

        # context manager: the handle is released even if reading fails
        with open(patch_path, 'r') as patch_file:
            patch_lines = patch_file.readlines()

        magic_ext = None
        process_flag = False
        diff_file = ''
        diff_cnt = 0
        diff_vuln_lines = []    # pre-patch code, diff markers stripped
        diff_orig_lines = []    # HTML-marked-up hunk text for display

        for line in patch_lines:
            if line.startswith('--- '):
                self._flush_hunk(patch_filename, diff_file, diff_cnt,
                                 magic_ext, diff_vuln_lines, diff_orig_lines)

                # a header without a path token is treated like /dev/null
                # instead of raising IndexError
                fields = line.split()
                diff_path = fields[1] if len(fields) > 1 else '/dev/null'
                if diff_path == '/dev/null':
                    process_flag = False
                else:
                    process_flag = True
                    diff_cnt = 0
                    diff_file = diff_path.split('/')[-1]
                    magic_ext = self._get_file_type(diff_file)

            elif process_flag:
                # exclude the line '--' at the end of `git format-patch -1`
                # NOTE: this prefix test also discards removed lines whose
                # own text begins with '-'.
                if line.startswith('--'):
                    continue

                if line.startswith('+++ '):
                    fields = line.split()
                    if len(fields) > 1 and fields[1] == '/dev/null':
                        process_flag = False

                elif line.startswith('@@'):
                    self._flush_hunk(patch_filename, diff_file, diff_cnt,
                                     magic_ext, diff_vuln_lines,
                                     diff_orig_lines)
                    diff_cnt += 1

                elif line.startswith('-'):
                    diff_vuln_lines.append(line[1:])
                    diff_orig_lines.append('<font color=\"#AA0000\">')
                    diff_orig_lines.append(self._escape_html(line))
                    diff_orig_lines.append('</font>')

                elif line.startswith('+'):
                    diff_orig_lines.append('<font color=\"#00AA00\">')
                    diff_orig_lines.append(self._escape_html(line))
                    diff_orig_lines.append('</font>')

                elif line.startswith(' '):
                    diff_vuln_lines.append(line[1:])
                    diff_orig_lines.append(self._escape_html(line))

        # the last hunk has no following header to trigger a flush
        self._flush_hunk(patch_filename, diff_file, diff_cnt,
                         magic_ext, diff_vuln_lines, diff_orig_lines)

    @staticmethod
    def _escape_html(text):
        '''
        Escape '&', '<' and '>' so diff text can sit between HTML tags
        '''
        # '&' must be replaced first, otherwise the entities produced for
        # '<' and '>' would themselves be escaped again
        return (text.replace('&', '&amp;')
                    .replace('<', '&lt;')
                    .replace('>', '&gt;'))

    def _flush_hunk(self, patch_filename, diff_file, diff_cnt, magic_ext,
                    diff_vuln_lines, diff_orig_lines):
        '''
        Record the buffered hunk (if any) and empty both line buffers

        The hunk is stored only when its normalized form has at least
        common.ngram_size lines; shorter hunks are reported as skipped.
        Both buffers are cleared in place on every call, so display lines
        from a hunk that had no pre-patch lines cannot carry over into the
        next hunk.
        '''
        if diff_vuln_lines:
            diff_norm_lines = self._normalize(''.join(diff_vuln_lines),
                                              magic_ext).split()
            if len(diff_norm_lines) >= common.ngram_size:
                common.verbose_print(' [-] %s %d (ext: %d)' % (diff_file, diff_cnt, magic_ext))
                path = '[%s] %s #%d' % (patch_filename, diff_file, diff_cnt)
                hash_list = self._build_hash_list(diff_norm_lines)
                self._patch_list.append(common.PatchInfo(
                    path, magic_ext, ''.join(diff_orig_lines),
                    diff_norm_lines, hash_list))
            else:
                common.verbose_print(' [-] %s %d (ext: %d) - skipped (%d lines)' % (diff_file, diff_cnt, magic_ext, len(diff_norm_lines)))

        del diff_vuln_lines[:]
        del diff_orig_lines[:]

    @staticmethod
    def _keep_noncomment(regex, text):
        '''
        Join the non-empty 'noncomment' groups of every regex match in text
        '''
        return ''.join([m.group('noncomment')
                        for m in regex.finditer(text)
                        if m.group('noncomment')])

    def _normalize(self, patch, ext):
        '''
        Normalize a patch file

        Applies the comment-matching regexes from common that correspond
        to ext (none for other file types), then the whitespace regex, and
        returns the result in lowercase.
        '''
        # Language-specific optimization
        if ext == common.FileExt.C or ext == common.FileExt.Java:
            patch = self._keep_noncomment(common.c_regex, patch)
            patch = self._keep_noncomment(common.c_partial_comment_regex, patch)
        elif ext == common.FileExt.ShellScript or ext == common.FileExt.Python:
            patch = self._keep_noncomment(common.shellscript_regex, patch)
        elif ext == common.FileExt.Perl:
            patch = self._keep_noncomment(common.perl_regex, patch)
        elif ext == common.FileExt.PHP:
            patch = self._keep_noncomment(common.php_regex, patch)
            patch = self._keep_noncomment(common.c_partial_comment_regex, patch)
        elif ext == common.FileExt.Ruby:
            patch = self._keep_noncomment(common.ruby_regex, patch)
            patch = self._keep_noncomment(common.ruby_partial_comment_regex, patch)

        # Remove whitespaces except newlines
        patch = common.whitespaces_regex.sub("", patch)

        # Convert into lowercases
        return patch.lower()

    def _build_hash_list(self, diff_norm_lines):
        '''
        Build a hash list

        Slides a window of common.ngram_size lines over diff_norm_lines
        and, for each window, appends three hash values (fnv1a, djb2,
        sdbm) of the concatenated lines, each masked with
        common.bloomfilter_size - 1.
        Returns a flat list holding three entries per window, in window
        order.
        '''
        # loop-invariant; presumably bloomfilter_size is a power of two so
        # the mask acts as a modulo -- TODO confirm in common
        mask = common.bloomfilter_size - 1
        hash_list = []
        num_ngram = len(diff_norm_lines) - common.ngram_size + 1
        for i in range(num_ngram):
            ngram = ''.join(diff_norm_lines[i:i + common.ngram_size])
            hash_list.extend((common.fnv1a_hash(ngram) & mask,
                              common.djb2_hash(ngram) & mask,
                              common.sdbm_hash(ngram) & mask))
        return hash_list

    def _get_file_type(self, file_path):
        '''
        Guess a file type based upon a file extension (mimetypes module)

        Returns a common.FileExt value; anything unknown or unrecognized
        is reported as common.FileExt.Text.
        '''
        file_type, _encoding = mimetypes.guess_type(file_path)
        if file_type is None:
            return common.FileExt.Text

        sub_type = file_type.partition('/')[2]

        exact_types = {
            # 'x-csh' would otherwise be caught by the 'x-c' prefix test
            # below and C-shell scripts would be treated as C source
            'x-csh': common.FileExt.ShellScript,
            'x-java': common.FileExt.Java,
            'x-sh': common.FileExt.ShellScript,
            'x-perl': common.FileExt.Perl,
            'x-python': common.FileExt.Python,
            'x-httpd-php': common.FileExt.PHP,
            'x-ruby': common.FileExt.Ruby,
        }
        if sub_type in exact_types:
            return exact_types[sub_type]
        if sub_type.startswith('x-c'):
            return common.FileExt.C
        return common.FileExt.Text

    def items(self):
        '''
        Return the list of collected common.PatchInfo entries
        '''
        return self._patch_list

    def length(self):
        '''
        Return the hunk count recorded by the last traverse() call
        '''
        return self._npatch