diff --git a/README.md b/README.md new file mode 100644 index 0000000..4a2e9ae --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +# Implementation of [Adversarial Training for Free!](https://arxiv.org/abs/1904.12843) + +the implementation is in the src folder and the pytorch layer `AdversarialForFree` does the work, see the documentation in the code for more information. + +The notebooks contain a few tests. The `con` notebook is a test on CIFAR-10 with a wide resnet where we test the properties mentioned with `m=4` and compare it with projected gradient descent adversarial training on performance and time. Furtheremore, we also investigate the cost of training with replays. \ No newline at end of file diff --git a/con.ipynb b/con.ipynb index 77028d1..0383c8e 100644 --- a/con.ipynb +++ b/con.ipynb @@ -13,12 +13,13 @@ "import torch.nn.functional as F\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "from collections import OrderedDict\n", + "from collections import OrderedDict, defaultdict\n", "import torch.optim as optim\n", "import time\n", "from src import *\n", "import math\n", - "from tqdm import tqdm" + "import pickle\n", + "import pandas as pd" ] }, { @@ -26,19 +27,71 @@ "execution_count": 2, "metadata": {}, "outputs": [], + "source": [ + "plt.rc('text', usetex=True)\n", + "%config InlineBackend.figure_format = 'retina'\n", + "!mkdir -p figures\n", + "!mkdir -p snapshots" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "# parameters\n", + "# PGK\n", "ϵ = 8 / 256\n", - "K = 7\n", + "ϵ_s = 2 / 256\n", + "\n", + "weight_decay = 1e-4\n", + "\n", + "val_K = 10\n", "retrain = 10\n", - "epoch_count = 300\n", + "EPOCHS = 300\n", + "TEST_EVERY = 30\n", "batch_size = 128\n", - "pre_train = False" + "pre_train = False\n", + "\n", + "small = False\n", + "training_with_replay_Ks = [1, 4, 10, 30]\n", + "free_Ks = [1, 2, 4, 10, 20]\n", + "\n", + " \n", + "PGD_Ks = [1, 2, 7]\n", + "\n", + "\n", + "attack_names = ['FSM', 'PGD-20', 'PGD-100', 'CW-100']\n", + "attacks = [\n", + " *[PGD(K, ϵ, 2.5 * ϵ/K) for K in [1, 20, 100]],\n", + " CW(100, 1e4, ϵ, 2.5 * ϵ/ 100)]\n", + " \n", + " \n", + "if small:\n", + " EPOCHS = 5\n", + " TEST_EVERY = 5\n", + " training_with_replay_Ks = [1, 5]\n", + " free_Ks = [1, 5]\n", + " attack_names = ['FSM', 'PGD-2', 'CW-2']\n", + " attacks = [\n", + " *[PGD(K, ϵ, 2.5 * ϵ/K) for K in [1, 2]],\n", + " CW(2, 1e4, ϵ, 2.5 * ϵ/ 2)]" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "assert all(EPOCHS == K * int(EPOCHS / K) for K in training_with_replay_Ks)\n", + "assert all(EPOCHS == K * int(EPOCHS / K) for K in free_Ks)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -62,28 +115,33 @@ " transforms.ToTensor()\n", "])\n", "\n", - "trainset = torchvision.datasets.CIFAR10(root='./data', train=True,\n", + "trainset = torchvision.datasets.CIFAR10(root='./data', train=True, \n", " download=True, transform=transform)\n", + "if small:\n", + " trainset = torch.utils.data.Subset(trainset, range(batch_size))\n", "\n", - "def trainloadicator(b):\n", - " return torch.utils.data.DataLoader(trainset, \n", - " batch_size=b,\n", + "trainloader = torch.utils.data.DataLoader(trainset, \n", + " batch_size=batch_size,\n", " shuffle=True, num_workers=4, \n", " pin_memory=True, drop_last=True)\n", - "trainloader = trainloadicator(batch_size)\n", "\n", "testset = torchvision.datasets.CIFAR10(root='./data', 
train=False,\n", " download=True, transform=transform_test)\n", - "testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,\n", + "\n", + "if small:\n", + " testset = torch.utils.data.Subset(testset, range(batch_size))\n", + " \n", + "testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size * 4,\n", " shuffle=False, num_workers=4)\n", "\n", + "\n", "dataiter = iter(trainloader)\n", "images, labels = dataiter.next()" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -92,7 +150,7 @@ "torch.Size([3, 32, 32])" ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -102,104 +160,102 @@ "\n", "criterion = nn.CrossEntropyLoss()\n", "\n", - "def mkmodel(ϵ=ϵ):\n", + "def build_model(ϵ=ϵ):\n", " model = WideResNet(28, 10, 10, 0.1)\n", " adv = AdversarialForFree(ϵ, 0, 1)\n", - " return nn.Sequential(OrderedDict([\n", - " ('adv', adv),\n", + " if ϵ not in [0, False]:\n", + " l = [('adv', adv)]\n", + " else:\n", + " l = []\n", + " l.extend([\n", " ('normalizer', norm),\n", - " ('resnet', model)])).cuda()\n", + " ('resnet', model)])\n", + " return nn.Sequential(OrderedDict(l)).cuda()\n", "\n", "imgsize = images.size()[1:]\n", "imgsize" ] }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " torch.Tensor>" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "F.relu" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "setting batch size to 128\n", - "train \t 1: 2.0249 24.0% 496.8s\n", - "val \t 1: 1.7618 35.3% 4.8s\n" + "\n", + "\n", + "\n", + "\n", + "\n", + "------------------------\n", + "\n", + "\n", + "\n", + " training with 1 replays\n", + "train \t 1: 1.5880 40.0% 71.2s\n", + "train \t 2: 1.1595 57.8% 71.2s\n", + "train \t 3: 0.9540 65.9% 71.5s\n", + "train \t 4: 0.8208 71.3% 71.4s\n", + "train \t 5: 0.7153 74.9% 71.3s\n", + "train \t 6: 0.6392 77.7% 71.2s\n", + "train \t 7: 0.5713 80.4% 71.1s\n", + "train \t 8: 0.5228 81.9% 71.0s\n", + "train \t 9: 0.4808 83.5% 70.9s\n", + "train \t 10: 0.4478 84.7% 70.8s\n", + "train \t 11: 0.4202 85.6% 70.8s\n" ] } ], "source": [ - "train_loader = None\n", - "train_batch = None\n", - "\n", - "model = mkmodel()\n", - "optimizer = optim.Adam(model.parameters())\n", - "\n", - "for epoch in range(math.ceil(epoch_count / K)): # loop over the dataset multiple times\n", - " \n", - " b = 128\n", - " if train_batch != b:\n", - " print(f'setting batch size to {b}')\n", - " train_loader = trainloadicator(b)\n", - " train_batch = b\n", - "\n", - " logs = Logisticator()\n", - " model.train()\n", - " \n", - " for i, data in enumerate(train_loader, 0):\n", - " # get the inputs; data is a list of [inputs, labels]\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " for k in range(K):\n", - " # zero the parameter gradients\n", - " optimizer.zero_grad()\n", - " \n", - " # forward + backward + optimize\n", - " outputs = model(inputs)\n", - " loss = F.cross_entropy(outputs, labels)\n", - " loss.backward()\n", - " \n", - " optimizer.step()\n", - " model.adv.step()\n", - " \n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " \n", - " print(f'train \\t {epoch + 1}: {logs}')\n", - " \n", - " \n", - " \n", - " model.train(False)\n", - " # 
valdiation loss\n", - " with torch.no_grad():\n", - " logs = Logisticator()\n", - " for data in testloader:\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " outputs = model(inputs)\n", - " loss = F.cross_entropy(outputs, labels)\n", - "\n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " \n", - " print(f'val \\t {epoch + 1}: {logs}')\n", - " \n", - " # adv loss\n", - " logs = Logisticator()\n", - " for data in testloader:\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " noise = PGK(model, lambda x: F.cross_entropy(x, labels), inputs, ϵ, K)\n", - " \n", - " with torch.no_grad():\n", - " outputs = model(inputs + noise)\n", - " loss = F.cross_entropy(outputs, labels)\n", + "#standard training with replay logs\n", + "srl = defaultdict(lambda : defaultdict(lambda : []))\n", "\n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " print(f'adv \\t {epoch + 1}: {logs}')\n", - " \n", - "print('Finished Training')\n" + "for K in training_with_replay_Ks:\n", + " print(f'\\n\\n\\n\\n\\n------------------------\\n\\n\\n\\n training with {K} replays')\n", + "\n", + " model = build_model(False)\n", + " optimizer = optim.Adam(model.parameters(), weight_decay=weight_decay)\n", + " \n", + " for epoch in range(int(EPOCHS / K)): # loop over the dataset multiple times\n", + " \n", + " logs = train_with_replay(K, model, trainloader, optimizer, epoch)\n", + " srl[K]['train'].append(logs)\n", + " if (epoch * K + K) % TEST_EVERY == 0:\n", + " # valdiation loss\n", + " logs = run_val(model, testloader, epoch)\n", + " srl[K]['test'].append(logs)\n", + " run_attacks(srl[K], attacks, \n", + " attack_names, model, testloader, epoch)\n", + " print('Finished Training')\n", + " torch.save(model.state_dict(), f\"wresnet-cifar-10-normal-{K}.pch\")\n", + " del model\n", + " torch.cuda.empty_cache()\n", + "\n", + "with open('snapshots/srl.pickle', 'wb') as fd:\n", + " pickle.dump(holder_to_dict(srl), fd)" ] }, { @@ -208,86 +264,67 @@ "metadata": {}, "outputs": [], "source": [ - "del model\n", - "torch.cuda.empty_cache() " + "fig, (ax2, ax1) = plt.subplots(ncols=2, figsize=(15,7))\n", + "\n", + "y = [srl[K][\"test\"][-1].acc * 100 for K in training_with_replay_Ks]\n", + "bars = ax1.bar([f'$m={K}$' for K in training_with_replay_Ks], y)\n", + "for (i, bar) in zip(y, bars):\n", + " t = ax1.text(bar.get_x() + bar.get_width() /2 - 0.07 , bar.get_height() + 0.10, f'{i:0.1f}%')\n", + "for ax in [ax1, ax2]:\n", + " ax.set_xlabel('number of replay steps $m$')\n", + "ax1.set_ylabel('validation accuracy ($\\%$)')\n", + "\n", + "ax2.set_ylabel('validation loss (KL)')\n", + "y = [srl[K][\"test\"][-1].loss for K in training_with_replay_Ks]\n", + "bars = ax2.bar([f'$m={K}$' for K in training_with_replay_Ks], y)\n", + "for (i, bar) in zip(y, bars):\n", + " t = ax2.text(bar.get_x() + bar.get_width() /2 - 0.07 , bar.get_height() + 0.10, f'{i:0.1f}')\n", + "def savefig(fig, name, f=['svg', 'pdf', 'png']):\n", + " for e in f:\n", + " fig.savefig('figures/' + name + '.' 
+ e)\n", + "savefig(fig, 'cost_of_replay')\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "train_loader = None\n", - "train_batch = None\n", - "\n", - "model = mkmodel(0)\n", - "optimizer = optim.Adam(model.parameters(), weight_decay=5e-4)\n", - "\n", - "for epoch in range(math.ceil(epoch_count / K)): # loop over the dataset multiple times\n", - " \n", - " b = 128\n", - " if train_batch != b:\n", - " print(f'setting batch size to {b}')\n", - " train_loader = trainloadicator(b)\n", - " train_batch = b\n", - "\n", - " logs = Logisticator()\n", - " model.train()\n", - " \n", - " for i, data in enumerate(train_loader, 0):\n", - " # get the inputs; data is a list of [inputs, labels]\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " for k in range(K):\n", - " # zero the parameter gradients\n", - " optimizer.zero_grad()\n", - " \n", - " # forward + backward + optimize\n", - " outputs = model(inputs)\n", - " loss = F.cross_entropy(outputs, labels)\n", - " loss.backward()\n", - " \n", - " optimizer.step()\n", - " model.adv.step()\n", - " \n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " \n", - " print(f'train \\t {epoch + 1}: {logs}')\n", - " \n", - " \n", - " \n", - " model.train(False)\n", - " # valdiation loss\n", - " with torch.no_grad():\n", - " logs = Logisticator()\n", - " for data in testloader:\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " outputs = model(inputs)\n", - " loss = F.cross_entropy(outputs, labels)\n", - "\n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " \n", - " print(f'val \\t {epoch + 1}: {logs}')\n", - " \n", - " # adv loss\n", - " logs = Logisticator()\n", - " for data in testloader:\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " noise = PGK(model, lambda x: F.cross_entropy(x, labels), inputs, ϵ, K)\n", + "free_logs = defaultdict(lambda : defaultdict(lambda :[]))\n", + "\n", + "for K in free_Ks:\n", + " model = build_model()\n", + " optimizer = optim.Adam(model.parameters())\n", + "\n", + " for epoch in range(int(EPOCHS / K)): # loop over the dataset multiple times\n", + " logs = train_with_replay(K, model, trainloader, optimizer, epoch,\n", + " after_func=lambda model: model.adv.step())\n", + " free_logs[K]['train'].append(logs)\n", " \n", - " with torch.no_grad():\n", - " outputs = model(inputs + noise)\n", - " loss = F.cross_entropy(outputs, labels)\n", + " if (epoch * K + K) % TEST_EVERY == 0:\n", + "\n", + " logs = run_val(model, testloader, epoch)\n", + " free_logs[K]['test'].append(logs)\n", + "\n", + " # adv loss\n", + " run_attacks(free_logs[K], attacks, attack_names, model, testloader, epoch)\n", + " \n", "\n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " print(f'adv \\t {epoch + 1}: {logs}')\n", + " print('Finished Training')\n", + " torch.save(model.state_dict(), f\"snapshots/wresnet-cifar-10-free-{K}.pch\")\n", + " del model\n", + " torch.cuda.empty_cache()\n", " \n", - "print('Finished Training')\n", - "del model\n", - "torch.cuda.empty_cache() " + "with open('snapshots/free_logs.pickle', 'wb') as fd:\n", + " pickle.dump(holder_to_dict(free_logs), fd)" ] }, { @@ -295,78 +332,51 @@ "execution_count": null, "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "train_loader = None\n", - "train_batch = None\n", - "\n", - "model = mkmodel(0)\n", - "optimizer = optim.Adam(model.parameters(), weight_decay=5e-4)\n", - "\n", - "for epoch in range(math.ceil(epoch_count / K)): # loop over the dataset multiple times\n", - " \n", - " b = 128\n", - " if train_batch != b:\n", - " print(f'setting batch size to {b}')\n", - " train_loader = trainloadicator(b)\n", - " train_batch = b\n", - "\n", - " logs = Logisticator()\n", - " model.train()\n", - " \n", - " for i, data in enumerate(train_loader, 0):\n", - " # get the inputs; data is a list of [inputs, labels]\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " for k in range(K):\n", - " # zero the parameter gradients\n", - " optimizer.zero_grad()\n", - " noise = PGK(model, lambda x: F.cross_entropy(x, labels), inputs, ϵ, K)\n", - " optimizer.zero_grad()\n", - " # forward + backward + optimize\n", - " outputs = model(inputs + noise)\n", - " loss = F.cross_entropy(outputs, labels)\n", - " loss.backward()\n", - " \n", - " optimizer.step()\n", - " model.adv.step()\n", - " \n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " \n", - " print(f'train \\t {epoch + 1}: {logs}')\n", - " \n", - " \n", + "pgd_logs = defaultdict(lambda : defaultdict(lambda : []))\n", + "\n", + "for K in PGD_Ks:\n", + " model = build_model(False)\n", + " optimizer = optim.Adam(model.parameters())\n", " \n", - " model.train(False)\n", - " # valdiation loss\n", - " with torch.no_grad():\n", - " logs = Logisticator()\n", - " for data in testloader:\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " outputs = model(inputs)\n", - " loss = F.cross_entropy(outputs, labels)\n", - "\n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " \n", - " print(f'val \\t {epoch + 1}: {logs}')\n", + " attack = PGD(K, ϵ, 2.5 * ϵ / K)\n", " \n", - " # adv loss\n", - " logs = Logisticator()\n", - " for data in testloader:\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " noise = PGK(model, lambda x: F.cross_entropy(x, labels), inputs, ϵ, K)\n", + " for epoch in range(EPOCHS): # loop over the dataset multiple times\n", + " \n", + " \n", + " \n", + "\n", + " logs = train_with_replay(1, \n", + " model, \n", + " trainloader, \n", + " optimizer,\n", + " epoch,\n", + " input_func=lambda inputs, labels: attack(model, inputs, labels))\n", + " pgd_logs[K]['train'].append(logs)\n", " \n", - " with torch.no_grad():\n", - " outputs = model(inputs + noise)\n", - " loss = F.cross_entropy(outputs, labels)\n", + " if (epoch + 1) % TEST_EVERY == 0:\n", + " \n", + " logs = run_val(model, testloader, epoch)\n", + " pgd_logs[K]['test'].append(logs)\n", + " run_attacks(pgd_logs[K], attacks, \n", + " attack_names, model, testloader, epoch)\n", "\n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " print(f'adv \\t {epoch + 1}: {logs}')\n", + " print('Finished Training')\n", + " torch.save(model.state_dict(), f\"snapshots/wresnet-cifar-10-pgk-{K}.pch\")\n", + " del model\n", + " torch.cuda.empty_cache()\n", " \n", - "print('Finished Training')\n", - "del model\n", - "torch.cuda.empty_cache() " + "with open('snapshots/pgd_logs.pickle', 'wb') as fd:\n", + " pickle.dump(holder_to_dict(pgd_logs), fd)" ] }, { @@ -374,9 +384,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "K = 1" - ] + "source": [] }, { "cell_type": "code", 
@@ -384,74 +392,42 @@ "metadata": {}, "outputs": [], "source": [ - "train_loader = None\n", - "train_batch = None\n", - "\n", - "model = mkmodel()\n", - "optimizer = optim.Adam(model.parameters(), weight_decay=5e-4)\n", - "\n", - "for epoch in range(math.ceil(epoch_count / K)): # loop over the dataset multiple times\n", - " \n", - " b = 128\n", - " if train_batch != b:\n", - " print(f'setting batch size to {b}')\n", - " train_loader = trainloadicator(b)\n", - " train_batch = b\n", - "\n", - " logs = Logisticator()\n", - " model.train()\n", - " \n", - " for i, data in enumerate(train_loader, 0):\n", - " # get the inputs; data is a list of [inputs, labels]\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " for k in range(K):\n", - " # zero the parameter gradients\n", - " optimizer.zero_grad()\n", - " \n", - " # forward + backward + optimize\n", - " outputs = model(inputs)\n", - " loss = F.cross_entropy(outputs, labels)\n", - " loss.backward()\n", + "fmt = lambda x: f'$${x * 100:.2f}\\%$$'\n", + "d = {}\n", + "d['Training'] = ['Natural', \n", + " *[f'Free $m={K}$' for K in free_Ks],\n", + " *[f'{K}-PGD' for K in PGD_Ks]]\n", + "\n", + "\n", + "x = [srl[1]['test'][-1].acc,\n", + " *[free_logs[K]['test'][-1].acc for K in free_Ks],\n", + " *[pgd_logs[K]['test'][-1].acc for K in PGD_Ks]]\n", + "\n", + "d['Natural Images'] = list(map(fmt, x))\n", " \n", - " optimizer.step()\n", - " model.adv.step()\n", - " \n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", + "for name in attack_names:\n", + " n = f'adv_test/{name}'\n", " \n", - " print(f'train \\t {epoch + 1}: {logs}')\n", + " x = [srl[1][n][-1].acc]\n", " \n", + " for K in free_Ks:\n", + " x.append(free_logs[K][n][-1].acc)\n", " \n", - " \n", - " model.train(False)\n", - " # valdiation loss\n", - " with torch.no_grad():\n", - " logs = Logisticator()\n", - " for data in testloader:\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " outputs = model(inputs)\n", - " loss = F.cross_entropy(outputs, labels)\n", - "\n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " \n", - " print(f'val \\t {epoch + 1}: {logs}')\n", - " \n", - " # adv loss\n", - " logs = Logisticator()\n", - " for data in testloader:\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " noise = PGK(model, lambda x: F.cross_entropy(x, labels), inputs, ϵ, K)\n", + " for K in PGD_Ks:\n", + " x.append(pgd_logs[K][n][-1].acc)\n", + " d[name] = list(map(fmt, x))\n", " \n", - " with torch.no_grad():\n", - " outputs = model(inputs + noise)\n", - " loss = F.cross_entropy(outputs, labels)\n", + "tt = lambda x: sum(i.time for i in x)\n", + "fmt = lambda x: f'$${math.ceil(x / 60)}$$'\n", + "x = [srl[1]['train'],\n", + " *[free_logs[K]['train'] for K in free_Ks],\n", + " *[pgd_logs[K]['train'] for K in PGD_Ks]]\n", "\n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " print(f'adv \\t {epoch + 1}: {logs}')\n", - " \n", - "print('Finished Training')\n" + "d['Training Time(M)'] = list(map(lambda x: fmt(tt(x)), x))\n", + "\n", + "df = pd.DataFrame(d)\n", + "\n", + "df" ] }, { @@ -460,76 +436,7 @@ "metadata": {}, "outputs": [], "source": [ - "train_loader = None\n", - "train_batch = None\n", - "\n", - "model = mkmodel(0)\n", - "optimizer = optim.Adam(model.parameters(), weight_decay=5e-4)\n", - "\n", - "for epoch in range(math.ceil(epoch_count / K)): # loop over the dataset multiple times\n", - " \n", - " b = 128\n", - " if 
train_batch != b:\n", - " print(f'setting batch size to {b}')\n", - " train_loader = trainloadicator(b)\n", - " train_batch = b\n", - "\n", - " logs = Logisticator()\n", - " model.train()\n", - " \n", - " for i, data in enumerate(train_loader, 0):\n", - " # get the inputs; data is a list of [inputs, labels]\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " for k in range(K):\n", - " # zero the parameter gradients\n", - " optimizer.zero_grad()\n", - " \n", - " # forward + backward + optimize\n", - " outputs = model(inputs)\n", - " loss = F.cross_entropy(outputs, labels)\n", - " loss.backward()\n", - " \n", - " optimizer.step()\n", - " model.adv.step()\n", - " \n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " \n", - " print(f'train \\t {epoch + 1}: {logs}')\n", - " \n", - " \n", - " \n", - " model.train(False)\n", - " # valdiation loss\n", - " with torch.no_grad():\n", - " logs = Logisticator()\n", - " for data in testloader:\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " outputs = model(inputs)\n", - " loss = F.cross_entropy(outputs, labels)\n", - "\n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " \n", - " print(f'val \\t {epoch + 1}: {logs}')\n", - " \n", - " # adv loss\n", - " logs = Logisticator()\n", - " for data in testloader:\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " noise = PGK(model, lambda x: F.cross_entropy(x, labels), inputs, ϵ, K)\n", - " \n", - " with torch.no_grad():\n", - " outputs = model(inputs + noise)\n", - " loss = F.cross_entropy(outputs, labels)\n", - "\n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " print(f'adv \\t {epoch + 1}: {logs}')\n", - " \n", - "print('Finished Training')\n", - "del model\n", - "torch.cuda.empty_cache() " + "df.to_csv('figures/grid.csv')" ] }, { @@ -538,77 +445,7 @@ "metadata": {}, "outputs": [], "source": [ - "train_loader = None\n", - "train_batch = None\n", - "\n", - "model = mkmodel(0)\n", - "optimizer = optim.Adam(model.parameters(), weight_decay=5e-4)\n", - "\n", - "for epoch in range(math.ceil(epoch_count / K)): # loop over the dataset multiple times\n", - " \n", - " b = 128\n", - " if train_batch != b:\n", - " print(f'setting batch size to {b}')\n", - " train_loader = trainloadicator(b)\n", - " train_batch = b\n", - "\n", - " logs = Logisticator()\n", - " model.train()\n", - " \n", - " for i, data in enumerate(train_loader, 0):\n", - " # get the inputs; data is a list of [inputs, labels]\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " for k in range(K):\n", - " # zero the parameter gradients\n", - " optimizer.zero_grad()\n", - " noise = PGK(model, lambda x: F.cross_entropy(x, labels), inputs, ϵ, K)\n", - " optimizer.zero_grad()\n", - " # forward + backward + optimize\n", - " outputs = model(inputs + noise)\n", - " loss = F.cross_entropy(outputs, labels)\n", - " loss.backward()\n", - " \n", - " optimizer.step()\n", - " model.adv.step()\n", - " \n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " \n", - " print(f'train \\t {epoch + 1}: {logs}')\n", - " \n", - " \n", - " \n", - " model.train(False)\n", - " # valdiation loss\n", - " with torch.no_grad():\n", - " logs = Logisticator()\n", - " for data in testloader:\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " outputs = model(inputs)\n", - " loss = F.cross_entropy(outputs, labels)\n", - "\n", - " acc = 
accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " \n", - " print(f'val \\t {epoch + 1}: {logs}')\n", - " \n", - " # adv loss\n", - " logs = Logisticator()\n", - " for data in testloader:\n", - " inputs, labels = map(lambda x: x.cuda(), data)\n", - " noise = PGK(model, lambda x: F.cross_entropy(x, labels), inputs, ϵ, K)\n", - " \n", - " with torch.no_grad():\n", - " outputs = model(inputs + noise)\n", - " loss = F.cross_entropy(outputs, labels)\n", - "\n", - " acc = accuracy(outputs, labels)\n", - " logs.add(acc, loss.item(), inputs.size(0))\n", - " print(f'adv \\t {epoch + 1}: {logs}')\n", - " \n", - "print('Finished Training')\n", - "del model\n", - "torch.cuda.empty_cache() " + "df.to_latex('figures/grid.tex')" ] }, { diff --git a/src/attacks.py b/src/attacks.py index 56d09cb..49a660e 100644 --- a/src/attacks.py +++ b/src/attacks.py @@ -1,23 +1,95 @@ +from turtle import backward + import torch -from .utils import tensor_zero_grad -''' -Projected gradient descent attack with k steps +import torch.nn.functional as F +from .utils import tensor_zero_grad, Logisticator +''' Projected gradient descent attack with k steps, M + +model: an object that has .zero_grad() and __call__(batch) -> pred + +criterion: a loss function that takes ONE argument + use a lambda function to convert the l(x, y) to l(x) + +x: an input batch +K: number of training steps ''' -def PGK(model, criterion, x, e, K, min=0, max=1): - noise = torch.zeros_like(x, device=x.device, requires_grad=True) - for k in range(K): + +class Attack: + def __init__(self, K, e, e_s, min=0, max=1) -> None: + self.K = K + self.e = e + self.e_s = e_s + self.min = min + self.max = max + + def step(self, noise): + with torch.no_grad(): + noise.grad.sign_() + noise += self.e_s * noise.grad + noise.clamp_(-self.e, self.e) + + def zero(self, model, noise): model.zero_grad() tensor_zero_grad(noise) + + def init_noise(self, x): + a = torch.rand(x.shape, device=x.device) * 2 * self.e - self.e + a.requires_grad_() + return a - outputs = model((x + noise).clamp(min, max)) + def run(self, model, x, noise): + return model(self.adv(x, noise)) + + def adv(self, x, noise): + return (x + noise).clamp(self.min, self.max) - loss = criterion(outputs) - loss.backward() + def finalize(self, noise): + noise.requires_grad_(False) +class PGD(Attack): + def __init__(self, K, e, e_s, min=0, max=1, loss=F.cross_entropy) -> None: + super().__init__(K, e, e_s, min, max) + self.loss = loss - with torch.no_grad(): - noise.grad.sign_() - noise += e * noise.grad - noise.clamp_(-e, e) + def __call__(self, model, x, y): + noise = self.init_noise(x) + + for _ in range(self.K): + self.zero(model, noise) + + p = self.run(model, x, noise) + + loss = self.loss(p, y) + loss.backward() + + self.step(noise) + + self.zero(model, noise) + self.finalize(noise) + return self.adv(x, noise) + +class CW(Attack): + def __init__(self, K, c, e, e_s, min=0, max=1) -> None: + super().__init__(K, e, e_s, min, max) + self.c = c - return noise - \ No newline at end of file + def __call__(self, model, x, y): + noise = self.init_noise(x) + mask = None + + for _ in range(self.K): + self.zero(model, noise) + + p = self.run(model, x, noise) + + if mask is None: + mask = torch.eye(p.shape[1], device=x.device)[y, :] + + correct_logit = p[torch.arange(p.shape[0]), y] + wrong_logit = ((1 - mask) * p - self.c * mask).max(axis=1)[0] + loss = F.relu(correct_logit - wrong_logit).sum() + loss.backward() + + self.zero(model, noise) + self.finalize(noise) + return self.adv(x, 
noise) + diff --git a/src/layers.py b/src/layers.py index e5b6582..cf8952a 100644 --- a/src/layers.py +++ b/src/layers.py @@ -3,25 +3,28 @@ import torch.nn.functional as F from .utils import * -""" -shift the average to 0 and the variance to 1 +""" Standard scaler as a layer + +Shifts the average to zero and scales the standard deviation to one. """ class StandardScalerLayer(nn.Module): """ data: function that returns an iterator - keep_axis: iterable for axixes to skeep defaults keep second argument + keep_axis: iterable for dims to keep + defaults keep second argument """ + def __init__(self, data, keep_dims=[1]): super(StandardScalerLayer, self).__init__() - - self.keep_dims=keep_dims + + self.keep_dims = keep_dims c = Collectinator(torch.zeros(3)) for inputs in data(): inputs_mean = mean_keepdim(inputs, keep_dims) c.add(inputs_mean, inputs.size(0)) - mean = c.mean - + mean = c.mean + c = Collectinator(torch.zeros(3)) for inputs in data(): m = inputs.size(0) @@ -35,8 +38,8 @@ def __init__(self, data, keep_dims=[1]): # so that optimizers don't change them self.register_buffer('mean', mean) self.register_buffer('std', std) - - def forward(self, x): + + def forward(self, x): return (x - self.reshape(x, self.mean)) / self.reshape(x, self.std) def reshape(self, x, y): @@ -45,20 +48,45 @@ def reshape(self, x, y): y = torch.unsqueeze(y, i) return y -''' -add adversarial noise to the input, -you need to call .step AFTER updating the gradient -m needs to be a buffer so that it doesn't get updated, -changing input shapes causes the buffer to reset. -set min and max to meaning ful values or (infty) + +''' Adversarial for free Layer + +This layer adds the noise as described in the paper +Adversarial training for free. + +Attributes +---------- +e: usually a float + ensures that the noise $||noise||_\infty \leq e$ +min, max: usually a float + clamp the output so that the adversarial example stays valid + set the $-\infty$ and $\infty$ to disable it +auto_zer_grad: bool + reset the gradient of the noise every time forward is called + IF YOU ARE ACCUMULATING GRADIENT, you need to disable it and + clear the gradient manually + +Usage notes +----------- + +- call .step() after calculating the gradient to update the noise +- if the size of the input changes, the buffer is dropped +- the noise is stored as a buffer so it wont show up in model.parameters + BUT model.to() will move the buffers to the desired device ''' class AdversarialForFree(nn.Module): - def __init__(self, e, min=0, max=1): + def __init__(self, e, min=0, max=1, + auto_zero_grad: bool = True): + super(AdversarialForFree, self).__init__() self.e = e self.min, self.max = min, max + self.auto_zero_grad = auto_zero_grad + + def forward(self, x, auto_zero_grad=None): + if auto_zero_grad is None: + auto_zero_grad = self.auto_zero_grad - def forward(self, x, auto_zero_grad=True): if hasattr(self, 'm') and auto_zero_grad: self.zero_grad() @@ -69,7 +97,7 @@ def forward(self, x, auto_zero_grad=True): return (x + self.m).clamp(self.min, self.max) else: return x - + def step(self): with torch.no_grad(): self.m.grad.sign_() @@ -79,7 +107,7 @@ def step(self): def zero_grad(self): if hasattr(self, 'm'): tensor_zero_grad(self.m) - + def clean(self): if hasattr(self, 'm'): - delattr(self, 'm') \ No newline at end of file + delattr(self, 'm') diff --git a/src/utils.py b/src/utils.py index 93536a2..b24ecc5 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,6 +1,8 @@ +from collections import defaultdict import time import torch - +import torch.nn.functional 
as F +"""Set the gradient of a tensor to zero, just like torch.optim.Optimizer""" def tensor_zero_grad(x, set_to_none: bool = False): if x.grad is not None: if set_to_none: @@ -11,8 +13,10 @@ def tensor_zero_grad(x, set_to_none: bool = False): else: x.grad.requires_grad_(False) x.grad.zero_() - """ -copyright doofenshmirtz co. + +""" A batched average tracker + + """ class Collectinator: def __init__(self, mean=0): @@ -23,19 +27,40 @@ def add(self, v, m=1): self.mean = self.mean * (self.n / nm) + v * (m / nm) self.n = nm + +""" A performance tracker + +Track accuracy, loss and runtime +""" class Logisticator: def __init__(self) -> None: - self.acc = Collectinator() - self.loss = Collectinator() + self._acc = Collectinator() + self._loss = Collectinator() + self.acc = 0 + self.loss = 0 self.now = time.time() - + self.end_time = None + def add(self, acc, loss, m): - self.acc.add(acc, m) - self.loss.add(loss, m) + self._acc.add(acc, m) + self.acc = self._acc.mean + self._loss.add(loss, m) + self.loss = self._loss.mean def __str__(self): - return f'{self.loss.mean:.4f} {self.acc.mean * 100:0.1f}% {time.time() - self.now:.1f}s' + self.end() + return f'{self.loss:.4f} {self.acc * 100:0.1f}% {self.end_time - self.now:.1f}s' + + def end(self): + if self.end_time is None: + self.end_time = time.time() + self.time = self.end_time - self.now + +""" +Take the average but instead of reducing the dims, keep them +Is there a pytorch version already? +""" def mean_keepdim(inputs, dims): d = [i for i in range(len(inputs.shape)) if i not in dims] @@ -43,4 +68,71 @@ def mean_keepdim(inputs, dims): def accuracy(outputs, labels): preds = torch.argmax(outputs, axis=1) - return torch.sum(preds == labels).item() / len(preds) \ No newline at end of file + return torch.sum(preds == labels).item() / len(preds) + +def train_with_replay(K, model, trainloader, optimizer, epoch, + input_func=lambda x, y: x, + after_func=lambda model: None): + logs = Logisticator() + + model.train() + + for data in trainloader: + # get the inputs; data is a list of [inputs, labels] + inputs, labels = map(lambda x: x.cuda(), data) + for k in range(K): + inputs = input_func(inputs, labels) + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = model(inputs) + loss = F.cross_entropy(outputs, labels) + loss.backward() + + optimizer.step() + after_func(model) + + acc = accuracy(outputs, labels) + logs.add(acc, loss.item(), inputs.size(0)) + print(f'train \t {epoch + 1}: {logs}') + return logs +def run_val(model, testloader, epoch): + model.train(False) + # valdiation loss + with torch.no_grad(): + logs = Logisticator() + + for data in testloader: + inputs, labels = map(lambda x: x.cuda(), data) + outputs = model(inputs) + loss = F.cross_entropy(outputs, labels) + + acc = accuracy(outputs, labels) + logs.add(acc, loss.item(), inputs.size(0)) + + print(f'val \t {epoch + 1}: {logs}') + return logs +def run_attacks(logholder, attacks, attack_names, model, testloader, epoch): + model.train(False) + for (attack, name) in zip(attacks, attack_names): + logs = Logisticator() + logholder[f'adv_test/{name}'].append(logs) + for data in testloader: + inputs, labels = map(lambda x: x.cuda(), data) + adv = attack(model, inputs, labels) + + with torch.no_grad(): + outputs = model(adv) + loss = F.cross_entropy(outputs, labels) + + acc = accuracy(outputs, labels) + logs.add(acc, loss.item(), inputs.size(0)) + print(f'adv {name} \t\t\t {epoch + 1}: {logs}') + +def bimap(f, g, x): + return [(f(a), g(b)) for (a, 
b) in x] +def identity(x): + return x +# Convert the nested defaultdicts (their lambda default factories cannot be pickled) +# into plain dicts so the run logs can be saved with pickle. +def holder_to_dict(holder: defaultdict): + return dict(bimap(identity, dict, holder.items())) \ No newline at end of file
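
For readers who want the gist without running `con.ipynb`, below is a minimal sketch of the free-training loop that the `AdversarialForFree` layer supports. It assumes the `src` package exports `AdversarialForFree` and `WideResNet` (as the notebook's `from src import *` suggests) and that a CUDA device is available; the hyper-parameters are illustrative and the notebook's `StandardScalerLayer` normalizer is omitted for brevity — this is not a verbatim excerpt of the notebook.

```python
# Sketch of "free" adversarial training with the AdversarialForFree layer from
# src/layers.py, mirroring the m-replay loop used in con.ipynb.
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from src import AdversarialForFree, WideResNet  # assumed to be exported by src

eps = 8 / 256   # L-infinity bound on the adversarial perturbation
m = 4           # number of replays of each minibatch ("free" steps)

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True,
                                        transform=transforms.ToTensor())
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)

# The adversarial layer sits in front of the classifier; the notebook also puts a
# StandardScalerLayer between them, omitted here for brevity.
model = nn.Sequential(OrderedDict([
    ('adv', AdversarialForFree(eps, 0, 1)),
    ('resnet', WideResNet(28, 10, 10, 0.1)),
])).cuda()
optimizer = optim.Adam(model.parameters())

model.train()
for inputs, labels in trainloader:
    inputs, labels = inputs.cuda(), labels.cuda()
    for _ in range(m):                    # replay the same minibatch m times
        optimizer.zero_grad()
        loss = F.cross_entropy(model(inputs), labels)
        loss.backward()                   # one backward pass yields gradients for
        optimizer.step()                  # both the weights (descent step) ...
        model.adv.step()                  # ... and the input noise (ascent step)
```

The design point is that a single `loss.backward()` per replay populates gradients for both the network weights and the noise buffer inside the layer, so `optimizer.step()` (weight descent) and `model.adv.step()` (a signed-gradient ascent on the perturbation, clamped to the ε-ball) share the cost of one forward/backward pass.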