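"""Black-box adversarial attack on Clarifai's 'moderation' and 'nsfw-v1.0' models.

The attack estimates the gradient of a Carlini-Wagner-style logit loss with
grouped finite differences over the Clarifai prediction API, then takes
iterative signed-gradient steps toward the 'safe'/'sfw' class, keeping the
perturbation inside an L-infinity ball of radius eps.

Note: this is a Python 2 script written against the deprecated clarifai.rest
(clarifai 2.x) client.
"""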
from clarifai.rest import ClarifaiApp
from clarifai.rest import Image as ClImage
import matplotlib.image as mpimg
import numpy as np
import time
import argparse
import StringIO  # Python 2 stdlib; this script targets Python 2
def dict_reader(concepts_list, preds_array):
    # Map the model's concept list into a fixed-order score array
    if args.target_model == 'moderation':
        names = ['safe', 'suggestive', 'explicit', 'drug', 'gore']
    elif args.target_model == 'nsfw-v1.0':
        names = ['sfw', 'nsfw']
    scores = {concept['name']: concept['value'] for concept in concepts_list}
    for i, name in enumerate(names):
        preds_array[i] = scores[name]
    return preds_array
def nsfw_dict_reader(concepts_list, preds_array):
    # Unused helper: dict_reader already covers the nsfw-v1.0 case
    scores = {concept['name']: concept['value'] for concept in concepts_list}
    preds_array[0] = scores['sfw']
    preds_array[1] = scores['nsfw']
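# CW_est_batch estimates the gradient of the Carlini-Wagner-style loss
#   loss(x) = log p_max(x) - log p_t(x)
# (equal to the logit difference z_max - z_t, since the softmax normalizer
# cancels within each prediction) via central finite differences:
#   d loss / d v  ~=  [loss(x + delta*v) - loss(x - delta*v)] / (2 * delta)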
def CW_est_batch(pred_plus_batch, pred_minus_batch, curr_target, max_index):
    logit_plus = np.log(pred_plus_batch)
    logit_plus_t = logit_plus[:, curr_target]
    logit_plus_max = logit_plus[:, max_index]
    logit_minus = np.log(pred_minus_batch)
    logit_minus_t = logit_minus[:, curr_target]
    logit_minus_max = logit_minus[:, max_index]
    # Central finite-difference estimates of the directional derivatives
    logit_t_grad_est = (logit_plus_t - logit_minus_t) / delta / 2.0
    logit_max_grad_est = (logit_plus_max - logit_minus_max) / delta / 2.0
    return logit_max_grad_est - logit_t_grad_est
def xent_est_batch(pred_plus_batch, pred_minus_batch, curr_target):
    # Unused alternative: finite-difference estimate of d p_t / d v
    pred_plus_t = pred_plus_batch[:, curr_target]
    pred_minus_t = pred_minus_batch[:, curr_target]
    return (pred_plus_t - pred_minus_t) / delta / 2.0
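# finite_diff_method builds one such estimate per pixel group: the dim pixel
# coordinates are randomly partitioned into dim / group_size groups, each
# group is perturbed jointly in both directions, and the resulting scalar
# estimate is shared by every coordinate in the group. This costs
# 2 * (dim / group_size) model queries per attack iteration.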
def finite_diff_method(curr_sample, curr_target, p_t, max_index, U=None):
    # Randomly assign pixel coordinates to groups of ~group_size
    random_indices = np.random.permutation(dim)
    num_groups = dim / group_size  # integer division (Python 2)
    print('Num_groups: {}'.format(num_groups))
    group_indices = np.array_split(random_indices, num_groups)
    buffers = []
    for j in range(num_groups):
        # Create a perturbation for this group
        basis_vec = np.zeros((IMAGE_ROWS, IMAGE_COLS, NUM_CHANNELS))
        basis_vec_flat = basis_vec.reshape(-1)
        basis_vec_flat[group_indices[j]] = 1.
        # Generate perturbed images
        image_plus_i = np.clip(curr_sample + delta * basis_vec, CLIP_MIN, CLIP_MAX)
        image_minus_i = np.clip(curr_sample - delta * basis_vec, CLIP_MIN, CLIP_MAX)
        # Serialize perturbed images for submission
        buf_plus = StringIO.StringIO()
        mpimg.imsave(buf_plus, np.round(image_plus_i).astype(np.uint8), format='png')
        buf_plus.seek(0)  # rewind so the full PNG is read when the request is built
        buffers.append(buf_plus)
        buf_minus = StringIO.StringIO()
        mpimg.imsave(buf_minus, np.round(image_minus_i).astype(np.uint8), format='png')
        buf_minus.seek(0)
        buffers.append(buf_minus)
    # Submit the perturbed images in batches of at most batch_size
    num_queries = num_groups * 2
    inputs = [ClImage(file_obj=buf) for buf in buffers]
    batch_size = 30
    num_batches = int(num_queries / batch_size)
    result = []
    if num_batches > 0:
        for i in range(num_batches):
            curr_input = inputs[i * batch_size:(i + 1) * batch_size]
            result.extend(model.predict(curr_input)['outputs'])
        curr_input = inputs[num_batches * batch_size:]
        if curr_input:  # skip the remainder batch when it is empty
            result.extend(model.predict(curr_input)['outputs'])
    else:
        result.extend(model.predict(inputs)['outputs'])
    for buf in buffers:
        buf.close()
    # Extract the scores: even-indexed results are the +delta images,
    # odd-indexed results are the -delta images
    pred_plus_batch = np.zeros((num_groups, num_classes))
    for pred_plus, output in zip(pred_plus_batch, result[0:num_queries:2]):
        dict_reader(output['data']['concepts'], pred_plus)
    pred_minus_batch = np.zeros((num_groups, num_classes))
    for pred_minus, output in zip(pred_minus_batch, result[1:num_queries:2]):
        dict_reader(output['data']['concepts'], pred_minus)
    # Do the actual finite-difference gradient estimate, one value per group
    group_grad_est = CW_est_batch(pred_plus_batch, pred_minus_batch, curr_target, max_index)
    grad_est = np.zeros((IMAGE_ROWS, IMAGE_COLS, NUM_CHANNELS))
    grad_est_flat = grad_est.reshape(-1)
    for indices, single_grad_est in zip(group_indices, group_grad_est):
        grad_est_flat[indices] = single_grad_est
    # Getting gradient of the loss
    # loss_grad = -1.0 * grad_est / p_t
    loss_grad = grad_est
    return loss_grad
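# Example invocation (the image file <target_image_name>.jpg must exist):
#   python attack_clarifai.py my_image --target_model moderation --eps 16 --num_iter 5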
parser = argparse.ArgumentParser()
parser.add_argument("target_image_name", help="image to misclassify (without the .jpg extension)")
parser.add_argument("--target_model", type=str, default='moderation',
                    help="target model for attack")
parser.add_argument("--eps", type=int, default=16,
                    help="perturbation magnitude to use")
parser.add_argument("--num_iter", type=int, default=5,
                    help="number of iterations to run")
parser.add_argument("--group_size", type=int, default=10000,
                    help="number of features to group together")
parser.add_argument("--delta", type=float, default=1.0,
                    help="local perturbation for finite differences")
args = parser.parse_args()
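# ClarifaiApp() with no arguments picks up API credentials from the client's
# local config (typically set up via the `clarifai config` CLI command or the
# CLARIFAI_API_KEY environment variable in clarifai 2.x).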
app = ClarifaiApp()
model = app.models.get(args.target_model)
time1 = time.time()
success = 0
avg_l2_perturb = 0
curr_image = args.target_image_name + '.jpg'
curr_sample = np.array(mpimg.imread(curr_image), dtype=float)
array_shape = curr_sample.shape
if len(curr_sample.shape) > 2:
    # Drop any alpha channel
    curr_sample = curr_sample[:, :, :3]
else:
    # Grayscale: add a singleton channel dimension
    curr_sample = curr_sample.reshape((array_shape[0], array_shape[1], 1))
BATCH_SIZE = 1
IMAGE_ROWS = curr_sample.shape[0]
IMAGE_COLS = curr_sample.shape[1]
NUM_CHANNELS = curr_sample.shape[2]
dim = IMAGE_ROWS * IMAGE_COLS * NUM_CHANNELS
delta = args.delta
CLIP_MIN = 0
CLIP_MAX = 255
group_size = args.group_size
eps = args.eps
norm = 'linf'
alpha = float(args.eps) / args.num_iter  # per-iteration step size (avoid integer division)
if args.target_model == 'moderation':
    num_classes = 5
elif args.target_model == 'nsfw-v1.0':
    num_classes = 2
curr_prediction = np.zeros((num_classes))
image_cl = ClImage(file_obj=open(curr_image, 'rb'))
curr_predict_dict = model.predict([image_cl])['outputs'][0]['data']['concepts']
curr_prediction = dict_reader(curr_predict_dict, curr_prediction)
orig_index = np.argmax(curr_prediction)
print("Original prediction: {}".format(curr_prediction))
temp_sample = curr_sample
temp_image = curr_image
curr_target = 0  # target class 0: 'safe' (moderation) / 'sfw' (nsfw-v1.0)
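# Iterative attack loop: query the model on the current iterate, estimate the
# loss gradient by finite differences, take a signed step of size alpha toward
# the target class, and project the result back into the eps-ball around the
# original image.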
for i in range(args.num_iter):
    image_cl = ClImage(file_obj=open(temp_image, 'rb'))
    temp_prediction = np.zeros((num_classes))
    temp_predict_dict = model.predict([image_cl])['outputs'][0]['data']['concepts']
    temp_prediction = dict_reader(temp_predict_dict, temp_prediction)
    temp_logits = np.log(temp_prediction)
    max_index = np.argmax(temp_prediction)
    loss_value = temp_logits[orig_index] - temp_logits[curr_target]
    print('Current loss value: {}'.format(loss_value))
    print('Current prediction: {}'.format(temp_prediction))
    p_t = temp_prediction[curr_target]
    loss_grad = finite_diff_method(temp_sample, curr_target, p_t, max_index)
    # Getting signed (or normalized) gradient of the loss
    if norm == 'linf':
        normed_loss_grad = np.sign(loss_grad)
    elif norm == 'l2':
        grad_norm = np.linalg.norm(loss_grad.reshape(dim))
        normed_loss_grad = loss_grad / grad_norm
    # eps_mod = eps - args.alpha
    # Step toward the target class, then project into the eps-ball
    image_adv = temp_sample - alpha * normed_loss_grad
    r = np.clip(image_adv - curr_sample, -eps, eps)
    temp_sample = np.clip(curr_sample + r, CLIP_MIN, CLIP_MAX)
    # Save the current iterate so the next iteration queries the quantized image
    temp_image = args.target_image_name + 'temp.png'
    mpimg.imsave(temp_image, np.round(temp_sample).astype(np.uint8))
x_adv = args.target_image_name + '_adv_' + str(args.eps) + '_' + str(args.num_iter) + '_' + str(args.delta) + '_' + str(args.group_size) + '.png'
mpimg.imsave(x_adv, np.round(temp_sample).astype(np.uint8))
# Getting the norm of the final perturbation
perturb_norm = np.linalg.norm((image_adv - curr_sample).reshape(dim))
perturb_norm_batch = np.mean(perturb_norm)
avg_l2_perturb += perturb_norm_batch
# Query the model once more on the saved adversarial image
image_adv_cl = ClImage(file_obj=open(x_adv, 'rb'))
adv_prediction = np.zeros((num_classes))
adv_predict_dict = model.predict([image_adv_cl])['outputs'][0]['data']['concepts']
adv_prediction = dict_reader(adv_predict_dict, adv_prediction)
adv_logits = np.log(adv_prediction)
loss_value = adv_logits[orig_index] - adv_logits[curr_target]
success += np.sum(np.argmax(adv_prediction) == curr_target)
success = 100.0 * float(success)  # 100.0 if the target class is now top-1, else 0.0
print('Final loss: {}'.format(loss_value))
print('Final prediction: {}'.format(adv_prediction))
print('Success: {}'.format(success))
ofile = open(args.target_image_name + '.txt', 'a')
ofile.write('eps: {}, num_iter: {}, group_size: {}, delta: {}, model: {} ---- success: {} \n'.format(
    eps, args.num_iter, args.group_size, args.delta, args.target_model, success))
ofile.write("Original prediction: {} \n".format(curr_prediction))
ofile.write("Final prediction: {}\n".format(adv_prediction))
ofile.close()
# success = 100.0 - success
time2 = time.time()
print('Average l2 perturbation: {}'.format(avg_l2_perturb))
print('Total time: {}'.format(time2-time1))