diff --git a/cnn/CNN_application.ipynb b/cnn/CNN_application.ipynb new file mode 100644 index 0000000..583aa40 --- /dev/null +++ b/cnn/CNN_application.ipynb @@ -0,0 +1,983 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Convolutional Neural Networks: Application\n", + "\n", + "Welcome to Course 4's second assignment! In this notebook, you will:\n", + "\n", + "- Implement helper functions that you will use when implementing a TensorFlow model\n", + "- Implement a fully functioning ConvNet using TensorFlow \n", + "\n", + "**After this assignment you will be able to:**\n", + "\n", + "- Build and train a ConvNet in TensorFlow for a classification problem \n", + "\n", + "We assume here that you are already familiar with TensorFlow. If you are not, please refer the *TensorFlow Tutorial* of the third week of Course 2 (\"*Improving deep neural networks*\")." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.0 - TensorFlow model\n", + "\n", + "In the previous assignment, you built helper functions using numpy to understand the mechanics behind convolutional neural networks. Most practical applications of deep learning today are built using programming frameworks, which have many built-in functions you can simply call. \n", + "\n", + "As usual, we will start by loading in the packages. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "import numpy as np\n", + "import h5py\n", + "import matplotlib.pyplot as plt\n", + "import scipy\n", + "from PIL import Image\n", + "from scipy import ndimage\n", + "import tensorflow as tf\n", + "from tensorflow.python.framework import ops\n", + "from cnn_utils import *\n", + "\n", + "%matplotlib inline\n", + "np.random.seed(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the next cell to load the \"SIGNS\" dataset you are going to use." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the data (signs)\n", + "X_train_orig, Y_train_orig, X_test_orig, Y_test_orig, classes = load_dataset()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a reminder, the SIGNS dataset is a collection of 6 signs representing numbers from 0 to 5.\n", + "\n", + "\n", + "\n", + "The next cell will show you an example of a labelled image in the dataset. Feel free to change the value of `index` below and re-run to see different examples. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "y = 2\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Example of a picture\n", + "index = 6\n", + "plt.imshow(X_train_orig[index])\n", + "print (\"y = \" + str(np.squeeze(Y_train_orig[:, index])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In Course 2, you had built a fully-connected network for this dataset. But since this is an image dataset, it is more natural to apply a ConvNet to it.\n", + "\n", + "To get started, let's examine the shapes of your data. " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of training examples = 1080\n", + "number of test examples = 120\n", + "X_train shape: (1080, 64, 64, 3)\n", + "Y_train shape: (1080, 6)\n", + "X_test shape: (120, 64, 64, 3)\n", + "Y_test shape: (120, 6)\n" + ] + } + ], + "source": [ + "X_train = X_train_orig/255.\n", + "X_test = X_test_orig/255.\n", + "Y_train = convert_to_one_hot(Y_train_orig, 6).T\n", + "Y_test = convert_to_one_hot(Y_test_orig, 6).T\n", + "print (\"number of training examples = \" + str(X_train.shape[0]))\n", + "print (\"number of test examples = \" + str(X_test.shape[0]))\n", + "print (\"X_train shape: \" + str(X_train.shape))\n", + "print (\"Y_train shape: \" + str(Y_train.shape))\n", + "print (\"X_test shape: \" + str(X_test.shape))\n", + "print (\"Y_test shape: \" + str(Y_test.shape))\n", + "conv_layers = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### 1.1 - Create placeholders\n", + "\n", + "TensorFlow requires that you create placeholders for the input data that will be fed into the model when running the session.\n", + "\n", + "**Exercise**: Implement the function below to create placeholders for the input image X and the output Y. You should not define the number of training examples for the moment. 
To do so, you could use \"None\" as the batch size, it will give you the flexibility to choose it later. Hence X should be of dimension **[None, n_H0, n_W0, n_C0]** and Y should be of dimension **[None, n_y]**. [Hint](https://www.tensorflow.org/api_docs/python/tf/placeholder)." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# GRADED FUNCTION: create_placeholders\n", + "\n", + "def create_placeholders(n_H0, n_W0, n_C0, n_y):\n", + " \"\"\"\n", + " Creates the placeholders for the tensorflow session.\n", + " \n", + " Arguments:\n", + " n_H0 -- scalar, height of an input image\n", + " n_W0 -- scalar, width of an input image\n", + " n_C0 -- scalar, number of channels of the input\n", + " n_y -- scalar, number of classes\n", + " \n", + " Returns:\n", + " X -- placeholder for the data input, of shape [None, n_H0, n_W0, n_C0] and dtype \"float\"\n", + " Y -- placeholder for the input labels, of shape [None, n_y] and dtype \"float\"\n", + " \"\"\"\n", + "\n", + " ### START CODE HERE ### (≈2 lines)\n", + " X = tf.placeholder(dtype=tf.float32, shape=[None, n_H0, n_W0, n_C0], )\n", + " \n", + " Y = tf.placeholder(dtype=tf.float32, shape=[None, n_y])\n", + " \n", + " ### END CODE HERE ###\n", + " \n", + " return X, Y" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "X = Tensor(\"Placeholder:0\", shape=(?, 64, 64, 3), dtype=float32)\n", + "Y = Tensor(\"Placeholder_1:0\", shape=(?, 6), dtype=float32)\n" + ] + } + ], + "source": [ + "X, Y = create_placeholders(64, 64, 3, 6)\n", + "print (\"X = \" + str(X))\n", + "print (\"Y = \" + str(Y))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + " X = Tensor(\"Placeholder:0\", shape=(?, 64, 64, 3), dtype=float32)\n", + "\n", + "
\n", + " Y = Tensor(\"Placeholder_1:0\", shape=(?, 6), dtype=float32)\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 - Initialize parameters\n", + "\n", + "You will initialize weights/filters $W1$ and $W2$ using `tf.contrib.layers.xavier_initializer(seed = 0)`. You don't need to worry about bias variables as you will soon see that TensorFlow functions take care of the bias. Note also that you will only initialize the weights/filters for the conv2d functions. TensorFlow initializes the layers for the fully connected part automatically. We will talk more about that later in this assignment.\n", + "\n", + "**Exercise:** Implement initialize_parameters(). The dimensions for each group of filters are provided below. Reminder - to initialize a parameter $W$ of shape [1,2,3,4] in Tensorflow, use:\n", + "```python\n", + "W = tf.get_variable(\"W\", [1,2,3,4], initializer = ...)\n", + "```\n", + "[More Info](https://www.tensorflow.org/api_docs/python/tf/get_variable)." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# GRADED FUNCTION: initialize_parameters\n", + "# W -- Weights, array of shape (f, f, n_C_prev, n_C)\n", + "def initialize_parameters():\n", + " \"\"\"\n", + " Initializes weight parameters to build a neural network with tensorflow. The shapes are:\n", + " W1 : [4, 4, 3, 8]\n", + " W2 : [2, 2, 8, 16]\n", + " Returns:\n", + " parameters -- a dictionary of tensors containing W1, W2\n", + " \"\"\"\n", + " \n", + " tf.set_random_seed(1) # so that your \"random\" numbers match ours\n", + " \n", + " ### START CODE HERE ### (approx. 
2 lines of code)\n", + " W1 = tf.get_variable('W1',[4,4,3,8], initializer=tf.contrib.layers.xavier_initializer(seed=0))\n", + " W2 = tf.get_variable('W2',[2,2,8,16], initializer=tf.contrib.layers.xavier_initializer(seed=0))\n", + " ### END CODE HERE ###\n", + "\n", + " parameters = {\"W1\": W1,\n", + " \"W2\": W2}\n", + " \n", + " return parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W0525 12:36:15.099149 7976 lazy_loader.py:50] \n", + "The TensorFlow contrib module will not be included in TensorFlow 2.0.\n", + "For more information, please see:\n", + " * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n", + " * https://github.com/tensorflow/addons\n", + " * https://github.com/tensorflow/io (for I/O related ops)\n", + "If you depend on functionality not listed there, please file an issue.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "W1 = [ 0.00131723 0.1417614 -0.04434952 0.09197326 0.14984085 -0.03514394\n", + " -0.06847463 0.05245192]\n", + "W2 = [-0.08566415 0.17750949 0.11974221 0.16773748 -0.0830943 -0.08058\n", + " -0.00577033 -0.14643836 0.24162132 -0.05857408 -0.19055021 0.1345228\n", + " -0.22779644 -0.1601823 -0.16117483 -0.10286498]\n" + ] + } + ], + "source": [ + "tf.reset_default_graph()\n", + "with tf.Session() as sess_test:\n", + " parameters = initialize_parameters()\n", + " init = tf.global_variables_initializer()\n", + " sess_test.run(init)\n", + " print(\"W1 = \" + str(parameters[\"W1\"].eval()[1,1,1]))\n", + " print(\"W2 = \" + str(parameters[\"W2\"].eval()[1,1,1]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Expected Output:**\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " W1 = \n", + " \n", + "[ 0.00131723 0.14176141 -0.04434952 0.09197326 0.14984085 -0.03514394
\n", + " -0.06847463 0.05245192]\n", + "
\n", + " W2 = \n", + " \n", + "[-0.08566415 0.17750949 0.11974221 0.16773748 -0.0830943 -0.08058
\n", + " -0.00577033 -0.14643836 0.24162132 -0.05857408 -0.19055021 0.1345228
\n", + " -0.22779644 -0.1601823 -0.16117483 -0.10286498]\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 - Forward propagation\n", + "\n", + "In TensorFlow, there are built-in functions that carry out the convolution steps for you.\n", + "\n", + "- **tf.nn.conv2d(X,W1, strides = [1,s,s,1], padding = 'SAME'):** given an input $X$ and a group of filters $W1$, this function convolves $W1$'s filters on X. The third input ([1,s,s,1]) represents the strides for each dimension of the input (m, n_H_prev, n_W_prev, n_C_prev). You can read the full documentation [here](https://www.tensorflow.org/api_docs/python/tf/nn/conv2d)\n", + "\n", + "- **tf.nn.max_pool(A, ksize = [1,f,f,1], strides = [1,s,s,1], padding = 'SAME'):** given an input A, this function uses a window of size (f, f) and strides of size (s, s) to carry out max pooling over each window. You can read the full documentation [here](https://www.tensorflow.org/api_docs/python/tf/nn/max_pool)\n", + "\n", + "- **tf.nn.relu(Z1):** computes the elementwise ReLU of Z1 (which can be any shape). You can read the full documentation [here.](https://www.tensorflow.org/api_docs/python/tf/nn/relu)\n", + "\n", + "- **tf.contrib.layers.flatten(P)**: given an input P, this function flattens each example into a 1D vector while maintaining the batch-size. It returns a flattened tensor with shape [batch_size, k]. You can read the full documentation [here.](https://www.tensorflow.org/api_docs/python/tf/contrib/layers/flatten)\n", + "\n", + "- **tf.contrib.layers.fully_connected(F, num_outputs):** given the flattened input F, it returns the output computed using a fully connected layer. You can read the full documentation [here.](https://www.tensorflow.org/api_docs/python/tf/contrib/layers/fully_connected)\n", + "\n", + "In the last function above (`tf.contrib.layers.fully_connected`), the fully connected layer automatically initializes weights in the graph and keeps on training them as you train the model.
Hence, you did not need to initialize those weights when initializing the parameters. \n", + "\n", + "\n", + "**Exercise**: \n", + "\n", + "Implement the `forward_propagation` function below to build the following model: `CONV2D -> RELU -> MAXPOOL -> CONV2D -> RELU -> MAXPOOL -> FLATTEN -> FULLYCONNECTED`. You should use the functions above. \n", + "\n", + "In detail, we will use the following parameters for all the steps:\n", + " - Conv2D: stride 1, padding is \"SAME\"\n", + " - ReLU\n", + " - Max pool: Use an 8 by 8 filter size and an 8 by 8 stride, padding is \"SAME\"\n", + " - Conv2D: stride 1, padding is \"SAME\"\n", + " - ReLU\n", + " - Max pool: Use a 4 by 4 filter size and a 4 by 4 stride, padding is \"SAME\"\n", + " - Flatten the previous output.\n", + " - FULLYCONNECTED (FC) layer: Apply a fully connected layer without a non-linear activation function. Do not call the softmax here. This will result in 6 neurons in the output layer, which then get passed later to a softmax. In TensorFlow, the softmax and cost function are lumped together into a single function, which you'll call in a different function when computing the cost.
" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "# GRADED FUNCTION: forward_propagation\n", + "\n", + "def forward_propagation(X, parameters, dropout=False):\n", + " \"\"\"\n", + " Implements the forward propagation for the model:\n", + " CONV2D -> RELU -> MAXPOOL -> CONV2D -> RELU -> MAXPOOL -> FLATTEN -> FULLYCONNECTED\n", + " \n", + " Arguments:\n", + " X -- input dataset placeholder, of shape (None, n_H0, n_W0, n_C0)\n", + " parameters -- python dictionary containing your parameters \"W1\", \"W2\"\n", + " the shapes are given in initialize_parameters\n", + "\n", + " Returns:\n", + " Z3 -- the output of the last LINEAR unit\n", + " \"\"\"\n", + " \n", + " # Retrieve the parameters from the dictionary \"parameters\" \n", + " W1 = parameters['W1']\n", + " W2 = parameters['W2']\n", + " \n", + " ### START CODE HERE ###\n", + " # CONV2D: stride of 1, padding 'SAME'\n", + " Z1 = tf.nn.conv2d(X, W1, strides=[1,1,1,1], padding='SAME' )\n", + " # RELU\n", + " A1 = tf.nn.relu(Z1)\n", + " # MAXPOOL: window 8x8, stride 8, padding 'SAME'\n", + " P1 = tf.nn.max_pool(A1,ksize=[1,8,8,1] ,strides=[1,8,8,1], padding='SAME' )\n", + " # CONV2D: filters W2, stride 1, padding 'SAME'\n", + " Z2 = tf.nn.conv2d(P1, W2, strides=[1,1,1,1], padding='SAME')\n", + " # RELU\n", + " A2 = tf.nn.relu(Z2)\n", + " # MAXPOOL: window 4x4, stride 4, padding 'SAME'\n", + " P2 = tf.nn.max_pool(A2,ksize=[1,4,4,1], strides=[1,4,4,1], padding='SAME')\n", + " # FLATTEN\n", + " P2 = tf.contrib.layers.flatten(P2)\n", + " \n", + " # FULLY-CONNECTED without non-linear activation function (do not call softmax).\n", + " # 6 neurons in output layer.
Hint: one of the arguments should be \"activation_fn=None\" \n", + " if dropout:\n", + " P2=tf.contrib.layers.fully_connected(P2,90, activation_fn=tf.nn.relu)\n", + " P2=tf.nn.dropout(P2, rate=0.2)\n", + " Z3 = tf.contrib.layers.fully_connected(P2, 6, activation_fn=None)\n", + " ### END CODE HERE ###\n", + "\n", + " return Z3" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W0525 13:07:36.688151 7976 deprecation.py:323] From E:\\Anaconda\\lib\\site-packages\\tensorflow\\contrib\\layers\\python\\layers\\layers.py:1634: flatten (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use keras.layers.flatten instead.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Z3 = [[ 1.4416982 -0.24909636 5.450499 -0.26189643 -0.20669901 1.3654672 ]\n", + " [ 1.4070845 -0.02573219 5.08928 -0.48669913 -0.40940714 1.2624855 ]]\n" + ] + } + ], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "with tf.Session() as sess:\n", + " np.random.seed(1)\n", + " X, Y = create_placeholders(64, 64, 3, 6)\n", + " parameters = initialize_parameters()\n", + " Z3 = forward_propagation(X, parameters)\n", + " init = tf.global_variables_initializer()\n", + " sess.run(init)\n", + " a = sess.run(Z3, {X: np.random.randn(2,64,64,3), Y: np.random.randn(2,6)})\n", + " print(\"Z3 = \" + str(a))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**:\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " Z3 =\n", + " \n", + " [[-0.44670227 -1.57208765 -1.53049231 -2.31013036 -1.29104376 0.46852064]
\n", + " [-0.17601591 -1.57972014 -1.4737016 -2.61672091 -1.00810647 0.5747785 ]]\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3 - Compute cost\n", + "\n", + "Implement the compute cost function below. You might find these two functions helpful: \n", + "\n", + "- **tf.nn.softmax_cross_entropy_with_logits(logits = Z3, labels = Y):** computes the softmax cross-entropy loss. This function both computes the softmax activation function as well as the resulting loss. You can check the full documentation [here.](https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits)\n", + "- **tf.reduce_mean:** computes the mean of elements across dimensions of a tensor. Use this to average the losses over all the examples to get the overall cost. You can check the full documentation [here.](https://www.tensorflow.org/api_docs/python/tf/reduce_mean)\n", + "\n", + "**Exercise**: Compute the cost below using the functions above." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# GRADED FUNCTION: compute_cost \n", + "\n", + "def compute_cost(Z3, Y):\n", + " \"\"\"\n", + " Computes the cost\n", + " \n", + " Arguments:\n", + " Z3 -- output of forward propagation (output of the last LINEAR unit), of shape (number of examples, 6)\n", + " Y -- \"true\" labels vector placeholder, same shape as Z3\n", + " \n", + " Returns:\n", + " cost - Tensor of the cost function\n", + " \"\"\"\n", + " \n", + " ### START CODE HERE ### (1 line of code)\n", + " cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=Z3, labels=Y) )\n", + " ### END CODE HERE ###\n", + " \n", + " return cost" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W0525 13:11:38.626000 7976 deprecation.py:323] From :16: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n",
+ "\n", + "Future major versions of TensorFlow will allow gradients to flow\n", + "into the labels input on backprop by default.\n", + "\n", + "See `tf.nn.softmax_cross_entropy_with_logits_v2`.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cost = 4.6648703\n" + ] + } + ], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "with tf.Session() as sess:\n", + " np.random.seed(1)\n", + " X, Y = create_placeholders(64, 64, 3, 6)\n", + " parameters = initialize_parameters()\n", + " Z3 = forward_propagation(X, parameters)\n", + " cost = compute_cost(Z3, Y)\n", + " init = tf.global_variables_initializer()\n", + " sess.run(init)\n", + " a = sess.run(cost, {X: np.random.randn(4,64,64,3), Y: np.random.randn(4,6)})\n", + " print(\"cost = \" + str(a))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**: \n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " cost =\n", + " \n", + " 2.91034\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.4 Model \n", + "\n", + "Finally you will merge the helper functions you implemented above to build a model. You will train it on the SIGNS dataset. \n", + "\n", + "You have implemented `random_mini_batches()` in the Optimization programming assignment of course 2. Remember that this function returns a list of mini-batches. \n", + "\n", + "**Exercise**: Complete the function below. \n", + "\n", + "The model below should:\n", + "\n", + "- create placeholders\n", + "- initialize parameters\n", + "- forward propagate\n", + "- compute the cost\n", + "- create an optimizer\n", + "\n", + "Finally you will create a session and run a for loop for num_epochs, get the mini-batches, and then for each mini-batch you will optimize the function. [Hint for initializing the variables](https://www.tensorflow.org/api_docs/python/tf/global_variables_initializer)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "# GRADED FUNCTION: model\n", + "\n", + "def model(X_train, Y_train, X_test, Y_test, learning_rate = 0.009,\n", + " num_epochs = 65, minibatch_size = 64, print_cost = True, dropout=False):\n", + " \"\"\"\n", + " Implements a three-layer ConvNet in Tensorflow:\n", + " CONV2D -> RELU -> MAXPOOL -> CONV2D -> RELU -> MAXPOOL -> FLATTEN -> FULLYCONNECTED\n", + " \n", + " Arguments:\n", + " X_train -- training set, of shape (None, 64, 64, 3)\n", + " Y_train -- training labels, of shape (None, n_y = 6)\n", + " X_test -- test set, of shape (None, 64, 64, 3)\n", + " Y_test -- test labels, of shape (None, n_y = 6)\n", + " learning_rate -- learning rate of the optimization\n", + " num_epochs -- number of epochs of the optimization loop\n", + " minibatch_size -- size of a minibatch\n", + " print_cost -- True to print the cost every 5 epochs\n", + " \n", + " Returns:\n", + " train_accuracy -- real number, accuracy on the train set (X_train)\n", + "
test_accuracy -- real number, testing accuracy on the test set (X_test)\n", + " parameters -- parameters learnt by the model. They can then be used to predict.\n", + " \"\"\"\n", + " \n", + " ops.reset_default_graph() # to be able to rerun the model without overwriting tf variables\n", + " tf.set_random_seed(1) # to keep results consistent (tensorflow seed)\n", + " seed = 3 # to keep results consistent (numpy seed)\n", + " (m, n_H0, n_W0, n_C0) = X_train.shape \n", + " n_y = Y_train.shape[1] \n", + " costs = [] # To keep track of the cost\n", + " \n", + " # Create Placeholders of the correct shape\n", + " ### START CODE HERE ### (1 line)\n", + " X,Y= create_placeholders(n_H0, n_W0, n_C0, n_y)\n", + " ### END CODE HERE ###\n", + "\n", + " # Initialize parameters\n", + " ### START CODE HERE ### (1 line)\n", + " parameters = initialize_parameters()\n", + " ### END CODE HERE ###\n", + " \n", + " # Forward propagation: Build the forward propagation in the tensorflow graph\n", + " ### START CODE HERE ### (1 line)\n", + " Z3 = forward_propagation(X, parameters, dropout=dropout)\n", + " ### END CODE HERE ###\n", + " \n", + " # Cost function: Add cost function to tensorflow graph\n", + " ### START CODE HERE ### (1 line)\n", + " cost = compute_cost(Z3, Y)\n", + " ### END CODE HERE ###\n", + " \n", + " # Backpropagation: Define the tensorflow optimizer. 
Use an AdamOptimizer that minimizes the cost.\n", + " ### START CODE HERE ### (1 line)\n", + " optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)\n", + " ### END CODE HERE ###\n", + " \n", + " # Initialize all the variables globally\n", + " init = tf.global_variables_initializer()\n", + " \n", + " # Start the session to compute the tensorflow graph\n", + " with tf.Session() as sess:\n", + " \n", + " # Run the initialization\n", + " sess.run(init)\n", + " \n", + " # Do the training loop\n", + " for epoch in range(num_epochs):\n", + "\n", + " minibatch_cost = 0.\n", + " num_minibatches = int(m / minibatch_size) # number of minibatches of size minibatch_size in the train set\n", + " seed = seed + 1\n", + " minibatches = random_mini_batches(X_train, Y_train, minibatch_size, seed)\n", + "\n", + " for minibatch in minibatches:\n", + "\n", + " # Select a minibatch\n", + " (minibatch_X, minibatch_Y) = minibatch\n", + " # IMPORTANT: The line that runs the graph on a minibatch.\n", + " # Run the session to execute the optimizer and the cost, the feed_dict should contain a minibatch for (X,Y).\n", + " ### START CODE HERE ### (1 line)\n", + " _ , temp_cost = sess.run([optimizer, cost], {X: minibatch_X, Y: minibatch_Y})\n", + " ### END CODE HERE ###\n", + " \n", + " minibatch_cost += temp_cost / num_minibatches\n", + " \n", + "\n", + " # Print the cost every 5 epochs and record it every epoch\n", + " if print_cost == True and epoch % 5 == 0:\n", + " print (\"Cost after epoch %i: %f\" % (epoch, minibatch_cost))\n", + " if print_cost == True and epoch % 1 == 0:\n", + " costs.append(minibatch_cost)\n", + " \n", + " \n", + " # plot the cost\n", + " plt.plot(np.squeeze(costs))\n", + " plt.ylabel('cost')\n", + " plt.xlabel('iterations (per tens)')\n", + " plt.title(\"Learning rate =\" + str(learning_rate))\n", + " plt.show()\n", + "\n", + " # Calculate the correct predictions\n", + " predict_op = tf.argmax(Z3, 1)\n", + " correct_prediction = tf.equal(predict_op, tf.argmax(Y, 1))\n", + " \n", + " # Calculate
accuracy on the train and test sets\n", + " accuracy = tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n", + " print(accuracy)\n", + " train_accuracy = accuracy.eval({X: X_train, Y: Y_train})\n", + " test_accuracy = accuracy.eval({X: X_test, Y: Y_test})\n", + " print(\"Train Accuracy:\", train_accuracy)\n", + " print(\"Test Accuracy:\", test_accuracy)\n", + " \n", + " return train_accuracy, test_accuracy, parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the following cell to train your model for 65 epochs (the default `num_epochs`). Check if your cost after epoch 0 and 5 matches our output. If not, stop the cell and go back to your code!" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cost after epoch 0: 1.918536\n", + "Cost after epoch 5: 1.024695\n", + "Cost after epoch 10: 0.624790\n", + "Cost after epoch 15: 0.485562\n", + "Cost after epoch 20: 0.373167\n", + "Cost after epoch 25: 0.316183\n", + "Cost after epoch 30: 0.280050\n", + "Cost after epoch 35: 0.225940\n", + "Cost after epoch 40: 0.197010\n", + "Cost after epoch 45: 0.169021\n", + "Cost after epoch 50: 0.156169\n", + "Cost after epoch 55: 0.137946\n", + "Cost after epoch 60: 0.121304\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(\"Mean_1:0\", shape=(), dtype=float32)\n", + "Train Accuracy: 0.97407407\n", + "Test Accuracy: 0.875\n" + ] + } + ], + "source": [ + "_, _, parameters = model(X_train, Y_train, X_test, Y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected output**: although it may not match perfectly, your expected output should be close to ours and your cost value should decrease.\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " **Cost after epoch 0 =**\n", + " \n", + " 1.917929\n", + "
\n", + " **Cost after epoch 5 =**\n", + " \n", + " 1.506757\n", + "
\n", + " **Train Accuracy =**\n", + " \n", + " 0.940741\n", + "
\n", + " **Test Accuracy =**\n", + " \n", + " 0.783333\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Congratulations! You have finished the assignment and built a model that recognizes SIGN language with almost 80% accuracy on the test set. If you wish, feel free to play around with this dataset further. You can actually improve its accuracy by spending more time tuning the hyperparameters, or using regularization (as this model clearly has a high variance). \n", + "\n", + "Once again, here's a thumbs up for your work! " + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "E:\\Anaconda\\lib\\site-packages\\ipykernel_launcher.py:2: DeprecationWarning: `imread` is deprecated!\n", + "`imread` is deprecated in SciPy 1.0.0.\n", + "Use ``matplotlib.pyplot.imread`` instead.\n", + " \n", + "E:\\Anaconda\\lib\\site-packages\\ipykernel_launcher.py:3: DeprecationWarning: `imresize` is deprecated!\n", + "`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.\n", + "Use ``skimage.transform.resize`` instead.\n", + " This is separate from the ipykernel package so we can avoid doing imports until\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fname = \"images/thumbs_up.jpg\"\n", + "image = np.array(ndimage.imread(fname, flatten=False))\n", + "my_image = scipy.misc.imresize(image, size=(64,64))\n", + "plt.imshow(my_image)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional part" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cost after epoch 0: 1.897077\n", + "Cost after epoch 5: 0.574760\n", + "Cost after epoch 10: 0.200760\n", + "Cost after epoch 15: 0.091636\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(\"Mean_1:0\", shape=(), dtype=float32)\n", + "Train Accuracy: 0.975\n", + "Test Accuracy: 0.89166665\n" + ] + } + ], + "source": [ + "# Let's add dropout layer into model and repeat experiment\n", + "_, _, parameters = model(X_train, Y_train, X_test, Y_test, dropout=True, num_epochs = 20)" + ] + } + ], + "metadata": { + "coursera": { + "course_slug": "convolutional-neural-networks", + "graded_item_id": "bwbJV", + "launcher_item_id": "0TkXB" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/cnn/CNN_lesson.ipynb b/cnn/CNN_lesson.ipynb new file mode 100644 index 0000000..b73c48d --- /dev/null +++ b/cnn/CNN_lesson.ipynb @@ -0,0 +1,863 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from scipy import misc\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "%matplotlib inline\n", + "\n", + "\n", + "img = misc.ascent()\n", + "plt.gray()\n", + "plt.imshow(img)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "filter = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]])\n", + "# filter = filter.T\n", + "\n", + "\n", + "features = img.copy()\n", + "for i in range(1, img.shape[0] - 1):\n", + " for j in range(1, img.shape[1] - 1):\n", + " conv = img[i - 1, j - 1] * filter[0][0]\n", + " conv += img[i - 1, j] * filter[0][1]\n", + " conv += img[i - 1, j + 1] * filter[0][2]\n", + " conv += img[i, j - 1] * filter[1][0]\n", + " conv += img[i, j] * filter[1][1]\n", + " conv += img[i, j + 1] * filter[1][2]\n", + " conv += img[i + 1, j - 1] * filter[2][0]\n", + " conv += img[i + 1, j] * filter[2][1]\n", + " conv += img[i + 1, j + 1] * filter[2][2]\n", + " \n", + " conv = 0. if min(conv, 255.) < 0 else min(conv, 255.) \n", + " \n", + " features[i, j] = conv\n", + " \n", + "plt.imshow(features) " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "pool_img = np.zeros((img.shape[0] // 2, img.shape[1] // 2))\n", + "\n", + "for i in range(0, features.shape[0], 2):\n", + " for j in range(0, features.shape[1], 2):\n", + " pool_img[i // 2, j // 2] = np.max(features[i:i+2, j:j+2])\n", + " \n", + "plt.imshow(pool_img) " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Logging before flag parsing goes to stderr.\n", + "W0528 09:08:06.997658 6852 deprecation.py:506] From E:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\ops\\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + "60000/60000 [==============================] - 4s 67us/sample - loss: 0.7327 - acc: 0.7620\n", + "Epoch 2/5\n", + "60000/60000 [==============================] - 4s 61us/sample - loss: 0.5099 - acc: 0.8270\n", + "Epoch 3/5\n", + "60000/60000 [==============================] - 4s 61us/sample - loss: 0.4664 - acc: 0.8389\n", + "Epoch 4/5\n", + "60000/60000 [==============================] - 4s 62us/sample - loss: 0.4416 - acc: 0.8465\n", + "Epoch 5/5\n", + "60000/60000 [==============================] - 4s 66us/sample - loss: 0.4251 - acc: 0.8525\n", + "10000/10000 [==============================] - 0s 41us/sample - loss: 0.4632 - acc: 0.8354\n" + ] + }, + { + "data": { + "text/plain": [ + "[0.46315675230026243, 0.8354]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import tensorflow as tf\n", + "\n", + "\n", + "(x_train, 
y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()\n", + "x_train = x_train / 255.\n", + "x_test = x_test / 255.\n", + "\n", + "model = tf.keras.Sequential([\n", + " tf.keras.layers.Flatten(),\n", + " tf.keras.layers.Dense(128, activation=tf.nn.relu),\n", + " tf.keras.layers.Dense(10, activation='softmax'),\n", + "])\n", + "model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n", + "\n", + "history = model.fit(x_train, y_train, epochs=5)\n", + "model.evaluate(x_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(history.history['loss'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + "60000/60000 [==============================] - 48s 808us/sample - loss: 0.8787 - acc: 0.6826\n", + "Epoch 2/5\n", + "60000/60000 [==============================] - 46s 766us/sample - loss: 0.5463 - acc: 0.7960\n", + "Epoch 3/5\n", + "60000/60000 [==============================] - 48s 797us/sample - loss: 0.4764 - acc: 0.8256\n", + "Epoch 4/5\n", + "60000/60000 [==============================] - 47s 791us/sample - loss: 0.4315 - acc: 0.8427\n", + "Epoch 5/5\n", + "60000/60000 [==============================] - 52s 873us/sample - loss: 0.4006 - acc: 0.8546\n", + "10000/10000 [==============================] - 3s 282us/sample - loss: 0.4088 - acc: 0.8507\n" + ] + }, + { + "data": { + "text/plain": [ + "[0.4087753909111023, 0.8507]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_train = x_train.reshape(60000, 28, 28, 1)\n", + "x_test = x_test.reshape(10000, 28, 28, 1)\n", + "\n", + "model = tf.keras.Sequential([\n", + " tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),\n", + " tf.keras.layers.MaxPool2D(2, 2),\n", + " tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),\n", + " tf.keras.layers.MaxPool2D(2, 2),\n", + " tf.keras.layers.Flatten(),\n", + " tf.keras.layers.Dense(128, activation=tf.nn.relu),\n", + " tf.keras.layers.Dense(10, activation='softmax'),\n", + "])\n", + "model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n", + "\n", + "history = model.fit(x_train, y_train, epochs=5)\n", + "model.evaluate(x_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total time execution 241 seconds\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "print(f'Total time execution {(48+46+48+47+52)} seconds')\n", + "plt.title(f'n_convolutions={32}')\n", + "plt.plot(history.history['loss'])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "EXERCISES\n", + "===\n", + "\n", + " - Try editing the convolutions. Change the 32s to either 16 or 64. What impact will this have on accuracy and/or training time?\n", + " - Remove the final Convolution. What impact will this have on accuracy or training time?\n", + " - How about adding more Convolutions? What impact do you think this will have? Experiment with it.\n", + " - Remove all Convolutions but the first. What impact do you think this will have? Experiment with it.\n", + " - In the previous lesson you implemented a callback to check on the loss function and to cancel training once it hit a certain amount. See if you can implement that here!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Try editing the convolutions. Change the 32s to either 16 or 64. What impact will this have on accuracy and/or training time?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def my_model(x_train,y_train,x_test,y_test,n_convolutions=32):\n", + " model = tf.keras.Sequential([\n", + " tf.keras.layers.Conv2D(n_convolutions, (3, 3), activation='relu', input_shape=(28, 28, 1)),\n", + " tf.keras.layers.MaxPool2D(2, 2),\n", + " tf.keras.layers.Conv2D(n_convolutions, (3, 3), activation='relu'),\n", + " tf.keras.layers.MaxPool2D(2, 2),\n", + " tf.keras.layers.Flatten(),\n", + " tf.keras.layers.Dense(128, activation=tf.nn.relu),\n", + " tf.keras.layers.Dense(10, activation='softmax'),\n", + " ])\n", + " model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n", + "\n", + " history = model.fit(x_train, y_train, epochs=5)\n", + " print('Model evaluate',model.evaluate(x_test, y_test))\n", + " plt.title(f'n_convolutions={n_convolutions}')\n", + " plt.plot(history.history['loss'])\n", + " return model, history" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + "60000/60000 [==============================] - 28s 469us/sample - loss: 0.8446 - acc: 0.6945\n", + "Epoch 2/5\n", + "60000/60000 [==============================] - 28s 468us/sample - loss: 0.5455 - acc: 0.7986\n", + "Epoch 3/5\n", + "60000/60000 [==============================] - 28s 464us/sample - loss: 0.4746 - acc: 0.8257\n", + "Epoch 4/5\n", + "60000/60000 [==============================] - 28s 469us/sample - loss: 0.4341 - acc: 0.8417\n", + "Epoch 5/5\n", + "60000/60000 [==============================] - 30s 495us/sample - loss: 0.4065 - acc: 0.8527\n", + "10000/10000 [==============================] - 2s 198us/sample - loss: 0.4191 - acc: 0.8499\n", + "Model evaluate [0.41913245186805725, 0.8499]\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "model_16, history_16 = my_model(x_train,y_train,x_test,y_test,n_convolutions=16)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total time execution for n_filters=16 142 seconds\n" + ] + } + ], + "source": [ + "print(f'Total time execution for n_filters=16 {(28+28+28+28+30)} seconds')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + "60000/60000 [==============================] - 107s 2ms/sample - loss: 0.8770 - acc: 0.6848\n", + "Epoch 2/5\n", + "60000/60000 [==============================] - 105s 2ms/sample - loss: 0.5388 - acc: 0.7997\n", + "Epoch 3/5\n", + "60000/60000 [==============================] - 105s 2ms/sample - loss: 0.4628 - acc: 0.8318\n", + "Epoch 4/5\n", + "60000/60000 [==============================] - 105s 2ms/sample - loss: 0.4192 - acc: 0.8467\n", + "Epoch 5/5\n", + "60000/60000 [==============================] - 106s 2ms/sample - loss: 0.3871 - acc: 0.8595\n", + "10000/10000 [==============================] - 5s 517us/sample - loss: 0.3969 - acc: 0.8566\n", + "Model evaluate [0.39689865803718566, 0.8566]\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "model_64, history_64 = my_model(x_train,y_train,x_test,y_test,n_convolutions=64)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total time execution for n_filters=64 528 seconds\n" + ] + } + ], + "source": [ + "print(f'Total time execution for n_filters=64 {(107+105+105+105+106)} seconds')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Report for different number of convolution filters (16, 32, 64)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " n_filters\n", + " \n", + " Accuracy\n", + " \n", + " Time (sec)\n", + "
\n", + " 16\n", + " \n", + " 0.8499\n", + " \n", + " 142\n", + "
\n", + " 32\n", + " \n", + " 0.8507\n", + " \n", + " 241\n", + "
\n", + " 64\n", + " \n", + " 0.8566\n", + " \n", + " 528\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.Remove the final Convolution. What impact will this have on accuracy or training time?" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "def my_model(x_train,y_train,x_test,y_test,n_convolutions=32):\n", + " model = tf.keras.Sequential([\n", + " tf.keras.layers.Conv2D(n_convolutions, (3, 3), activation='relu', input_shape=(28, 28, 1)),\n", + " tf.keras.layers.MaxPool2D(2, 2),\n", + " #tf.keras.layers.Conv2D(n_convolutions, (3, 3), activation='relu'),\n", + " #tf.keras.layers.MaxPool2D(2, 2),\n", + " tf.keras.layers.Flatten(),\n", + " tf.keras.layers.Dense(128, activation=tf.nn.relu),\n", + " tf.keras.layers.Dense(10, activation='softmax'),\n", + " ])\n", + " model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n", + "\n", + " history = model.fit(x_train, y_train, epochs=5)\n", + " print('Model evaluate (1 conv_2d layer)',model.evaluate(x_test, y_test))\n", + " plt.title(f'n_convolutions={n_convolutions}')\n", + " plt.plot(history.history['loss'])\n", + " return model, history" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + "60000/60000 [==============================] - 39s 650us/sample - loss: 0.6920 - acc: 0.7566\n", + "Epoch 2/5\n", + "60000/60000 [==============================] - 40s 660us/sample - loss: 0.4868 - acc: 0.8260\n", + "Epoch 3/5\n", + "60000/60000 [==============================] - 53s 886us/sample - loss: 0.4327 - acc: 0.8470\n", + "Epoch 4/5\n", + "60000/60000 [==============================] - 41s 675us/sample - loss: 0.4009 - acc: 0.8579\n", + "Epoch 5/5\n", + "60000/60000 [==============================] - 39s 658us/sample - loss: 0.3709 - acc: 0.8692\n", + "10000/10000 [==============================] - 2s 216us/sample - loss: 
0.3824 - acc: 0.8622\n", + "Model evaluate (1 conv_2d layer) [0.38239504699707033, 0.8622]\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "model_1conv, history_1conv = my_model(x_train,y_train,x_test,y_test,n_convolutions=32)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total time execution 212\n" + ] + } + ], + "source": [ + "print(f'Total time execution {39+40+53+41+39}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Outcome: If we use only one convolutional layer, accuracy increases to 0.8622 (execution time 212 sec)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. How about adding more Convolutions? What impact do you think this will have? Experiment with it." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def my_model_3(x_train,y_train,x_test,y_test,n_convolutions=32):\n", + " model = tf.keras.Sequential([\n", + " tf.keras.layers.Conv2D(n_convolutions, (3, 3), activation='relu', input_shape=(28, 28, 1)),\n", + " tf.keras.layers.MaxPool2D(2, 2),\n", + " tf.keras.layers.Conv2D(n_convolutions, (3, 3), activation='relu'),\n", + " tf.keras.layers.MaxPool2D(2, 2),\n", + " tf.keras.layers.Conv2D(n_convolutions, (3, 3), activation='relu'),\n", + " tf.keras.layers.MaxPool2D(2, 2),\n", + " tf.keras.layers.Flatten(),\n", + " tf.keras.layers.Dense(128, activation=tf.nn.relu),\n", + " tf.keras.layers.Dense(10, activation='softmax'),\n", + " ])\n", + " model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n", + "\n", + " history = model.fit(x_train, y_train, epochs=5)\n", + " print('Model evaluate (1 conv_2d layer)',model.evaluate(x_test, y_test))\n", + " plt.title(f'n_convolutions={n_convolutions}')\n", + " plt.plot(history.history['loss'])\n", + " return model, history" + ] + }, + { + "cell_type": 
"code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + "60000/60000 [==============================] - 52s 866us/sample - loss: 1.3171 - acc: 0.5163\n", + "Epoch 2/5\n", + "60000/60000 [==============================] - 53s 877us/sample - loss: 0.7443 - acc: 0.7265\n", + "Epoch 3/5\n", + "60000/60000 [==============================] - 52s 872us/sample - loss: 0.6418 - acc: 0.7644\n", + "Epoch 4/5\n", + "60000/60000 [==============================] - 50s 840us/sample - loss: 0.5768 - acc: 0.7863\n", + "Epoch 5/5\n", + "60000/60000 [==============================] - 50s 841us/sample - loss: 0.5382 - acc: 0.8015\n", + "10000/10000 [==============================] - 3s 280us/sample - loss: 0.5354 - acc: 0.8044\n", + "Model evaluate (1 conv_2d layer) [0.5353573750972748, 0.8044]\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "model_3, history_3 = my_model_3(x_train,y_train,x_test,y_test,n_convolutions=32)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total time execution 257\n" + ] + } + ], + "source": [ + "print(f'Total time execution {52+53+52+50+50}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Table of accuracy for different number of conv layers (convolution filters = 32)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " n_conv_layers\n", + " \n", + " Accuracy\n", + " \n", + " Time (sec)\n", + "
\n", + " 1\n", + " \n", + " 0.8622\n", + " \n", + " 212\n", + "
\n", + " 2\n", + " \n", + " 0.8507\n", + " \n", + " 241\n", + "
\n", + " 3\n", + " \n", + " 0.8044\n", + " \n", + " 257\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Callback to check on the loss function and to cancel training once it hit a certain amount" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "from tensorflow.keras import callbacks" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train on 60000 samples, validate on 10000 samples\n", + "Epoch 1/30\n", + "60000/60000 [==============================] - 59s 977us/sample - loss: 0.9093 - acc: 0.6723 - val_loss: 0.5892 - val_acc: 0.7794\n", + "Epoch 2/30\n", + "60000/60000 [==============================] - 57s 943us/sample - loss: 0.5277 - acc: 0.8045 - val_loss: 0.5143 - val_acc: 0.8090\n", + "Epoch 3/30\n", + "60000/60000 [==============================] - 59s 975us/sample - loss: 0.4614 - acc: 0.8329 - val_loss: 0.4520 - val_acc: 0.8385\n", + "Epoch 4/30\n", + "60000/60000 [==============================] - 55s 911us/sample - loss: 0.4208 - acc: 0.8474 - val_loss: 0.4320 - val_acc: 0.8432\n", + "Epoch 5/30\n", + "60000/60000 [==============================] - 55s 919us/sample - loss: 0.3934 - acc: 0.8585 - val_loss: 0.4002 - val_acc: 0.8563\n", + "Epoch 6/30\n", + "60000/60000 [==============================] - 54s 893us/sample - loss: 0.3726 - acc: 0.8637 - val_loss: 0.3825 - val_acc: 0.8615\n", + "Epoch 7/30\n", + "60000/60000 [==============================] - 54s 904us/sample - loss: 0.3555 - acc: 0.8699 - val_loss: 0.3671 - val_acc: 0.8661\n", + "Epoch 8/30\n", + "60000/60000 [==============================] - 52s 871us/sample - loss: 0.3408 - acc: 0.8769 - val_loss: 0.3624 - val_acc: 0.8700\n", + "Epoch 00008: early stopping\n" + ] + } + ], + "source": [ + "x_train = x_train.reshape(60000, 28, 28, 
1)\n", + "x_test = x_test.reshape(10000, 28, 28, 1)\n", + "\n", + "es=callbacks.EarlyStopping(monitor='val_loss',verbose=1, min_delta=0.01, mode='min')\n", + "\n", + "model = tf.keras.Sequential([\n", + " tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),\n", + " tf.keras.layers.MaxPool2D(2, 2),\n", + " tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),\n", + " tf.keras.layers.MaxPool2D(2, 2),\n", + " tf.keras.layers.Flatten(),\n", + " tf.keras.layers.Dense(128, activation=tf.nn.relu),\n", + " tf.keras.layers.Dense(10, activation='softmax'),\n", + "])\n", + "model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n", + "\n", + "history = model.fit(x_train, y_train, epochs=30, validation_data=(x_test, y_test), callbacks=[es,])\n", + "#model.evaluate(x_test, y_test)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/cnn/CNN_step_by_step.ipynb b/cnn/CNN_step_by_step.ipynb new file mode 100644 index 0000000..6d581e1 --- /dev/null +++ b/cnn/CNN_step_by_step.ipynb @@ -0,0 +1,1375 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Convolutional Neural Networks: Step by Step\n", + "\n", + "Welcome to Course 4's first assignment! In this assignment, you will implement convolutional (CONV) and pooling (POOL) layers in numpy, including both forward propagation and (optionally) backward propagation. \n", + "\n", + "**Notation**:\n", + "- Superscript $[l]$ denotes an object of the $l^{th}$ layer. \n", + " - Example: $a^{[4]}$ is the $4^{th}$ layer activation. 
$W^{[5]}$ and $b^{[5]}$ are the $5^{th}$ layer parameters.\n", + "\n", + "\n", + "- Superscript $(i)$ denotes an object from the $i^{th}$ example. \n", + " - Example: $x^{(i)}$ is the $i^{th}$ training example input.\n", + " \n", + " \n", + "- Subscript $i$ denotes the $i^{th}$ entry of a vector.\n", + " - Example: $a^{[l]}_i$ denotes the $i^{th}$ entry of the activations in layer $l$, assuming this is a fully connected (FC) layer.\n", + " \n", + " \n", + "- $n_H$, $n_W$ and $n_C$ denote respectively the height, width and number of channels of a given layer. If you want to reference a specific layer $l$, you can also write $n_H^{[l]}$, $n_W^{[l]}$, $n_C^{[l]}$. \n", + "- $n_{H_{prev}}$, $n_{W_{prev}}$ and $n_{C_{prev}}$ denote respectively the height, width and number of channels of the previous layer. If referencing a specific layer $l$, this could also be denoted $n_H^{[l-1]}$, $n_W^{[l-1]}$, $n_C^{[l-1]}$. \n", + "\n", + "We assume that you are already familiar with `numpy` and/or have completed the previous courses of the specialization. Let's get started!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1 - Packages\n", + "\n", + "Let's first import all the packages that you will need during this assignment. \n", + "- [numpy](https://www.numpy.org) is the fundamental package for scientific computing with Python.\n", + "- [matplotlib](http://matplotlib.org) is a library to plot graphs in Python.\n", + "- np.random.seed(1) is used to keep all the random function calls consistent. It will help us grade your work." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import h5py\n", + "import matplotlib.pyplot as plt\n", + "\n", + "%matplotlib inline\n", + "plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots\n", + "plt.rcParams['image.interpolation'] = 'nearest'\n", + "plt.rcParams['image.cmap'] = 'gray'\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "np.random.seed(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2 - Outline of the Assignment\n", + "\n", + "You will be implementing the building blocks of a convolutional neural network! Each function you will implement will have detailed instructions that will walk you through the steps needed:\n", + "\n", + "- Convolution functions, including:\n", + " - Zero Padding\n", + " - Convolve window \n", + " - Convolution forward\n", + " - Convolution backward (optional)\n", + "- Pooling functions, including:\n", + " - Pooling forward\n", + " - Create mask \n", + " - Distribute value\n", + " - Pooling backward (optional)\n", + " \n", + "This notebook will ask you to implement these functions from scratch in `numpy`. In the next notebook, you will use the TensorFlow equivalents of these functions to build the following model:\n", + "\n", + "\n", + "\n", + "**Note** that for every forward function, there is its corresponding backward equivalent. Hence, at every step of your forward module you will store some parameters in a cache. These parameters are used to compute gradients during backpropagation. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3 - Convolutional Neural Networks\n", + "\n", + "Although programming frameworks make convolutions easy to use, they remain one of the hardest concepts to understand in Deep Learning. A convolution layer transforms an input volume into an output volume of different size, as shown below. 
\n", + "\n", + "\n", + "\n", + "In this part, you will build every step of the convolution layer. You will first implement two helper functions: one for zero padding and the other for computing the convolution function itself. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.1 - Zero-Padding\n", + "\n", + "Zero-padding adds zeros around the border of an image:\n", + "\n", + "\n", + "
**Figure 1** : **Zero-Padding**
Image (3 channels, RGB) with a padding of 2.
\n", + "\n", + "The main benefits of padding are the following:\n", + "\n", + "- It allows you to use a CONV layer without necessarily shrinking the height and width of the volumes. This is important for building deeper networks, since otherwise the height/width would shrink as you go to deeper layers. An important special case is the \"same\" convolution, in which the height/width is exactly preserved after one layer. \n", + "\n", + "- It helps us keep more of the information at the border of an image. Without padding, very few values at the next layer would be affected by pixels at the edges of an image.\n", + "\n", + "**Exercise**: Implement the following function, which pads all the images of a batch of examples X with zeros. [Use np.pad](https://docs.scipy.org/doc/numpy/reference/generated/numpy.pad.html). Note if you want to pad the array \"a\" of shape $(5,5,5,5,5)$ with `pad = 1` for the 2nd dimension, `pad = 3` for the 4th dimension and `pad = 0` for the rest, you would do:\n", + "```python\n", + "a = np.pad(a, ((0,0), (1,1), (0,0), (3,3), (0,0)), 'constant', constant_values = (..,..))\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# GRADED FUNCTION: zero_pad\n", + "\n", + "def zero_pad(X, pad):\n", + " \"\"\"\n", + " Pad with zeros all images of the dataset X. 
The padding is applied to the height and width of an image, \n", + " as illustrated in Figure 1.\n", + " \n", + " Argument:\n", + " X -- python numpy array of shape (m, n_H, n_W, n_C) representing a batch of m images\n", + " pad -- integer, amount of padding around each image on vertical and horizontal dimensions\n", + " \n", + " Returns:\n", + " X_pad -- padded image of shape (m, n_H + 2*pad, n_W + 2*pad, n_C)\n", + " \"\"\"\n", + " \n", + " ### START CODE HERE ### (≈ 1 line)\n", + " X_pad = np.pad(X, [(0,0),(pad,pad),(pad,pad),(0,0)], 'constant', constant_values=0)\n", + " ### END CODE HERE ###\n", + " \n", + " return X_pad" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "x.shape = (4, 3, 3, 2)\n", + "x_pad.shape = (4, 7, 7, 2)\n", + "x[1,1] = [[ 0.90085595 -0.68372786]\n", + " [-0.12289023 -0.93576943]\n", + " [-0.26788808 0.53035547]]\n", + "x_pad[1,1] = [[0. 0.]\n", + " [0. 0.]\n", + " [0. 0.]\n", + " [0. 0.]\n", + " [0. 0.]\n", + " [0. 0.]\n", + " [0. 
0.]]\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAADHCAYAAADxqlPLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEi5JREFUeJzt3X2QXXV9x/H3hyQGYYnYJEpMAkGJjKgVYoowdBjKQycgA86UdqBVQWUydUSx2lGxM0idqaX9w6rFgYmBACUD2kBrikGKw5NM5SFAeAgBGxlotoFJAgrEB2Dh0z/uCb3Z3Oxu9py95949n9fMTu6553fP73v3nvnk7Dnn/n6yTURENMtedRcQERHdl/CPiGighH9ERAMl/CMiGijhHxHRQAn/iIgGSvhHxKQl6RxJd9VdRy9K+EdENFDCPyKigRL+fUzSuyQ9L2lRsfwOSdskHVdzaRHA+PZRSbdL+ntJ90p6QdIPJf1e2/p/lfRsse5OSe9tWzdT0mpJL0q6F3jXRL6/fpbw72O2fwF8GVgpaR9gBXCl7dtrLSyiUGIf/TjwSeAdwBDwnbZ1NwELgbcBDwAr29Z9F/gdMKd4/SfLv4vJSRnbp/9JWg0cDBj4A9sv11xSxE72ZB+VdDtwt+2vFMuHAeuAN9t+bVjb/YFfAvsD22kF//ttP16s/wZwrO0/rPxN9bkc+U8O3wPeB/xzgj961J7uo5vaHj8NTANmSZoi6WJJv5D0IvBU0WYWMBuY2uG10UHCv89JGgC+BVwOXNR+bjSiF4xzH53f9vhA4FVgG/DnwOnAicBbgAU7ugG20jpFNPy10UHCv/99G7jf9rnAj4DLaq4nYrjx7KMflXRYcZ3g68Cq4pTPfsDLwHPAPsA3drygWH8Drf9g9ilOF51d7VuZPBL+fUzS6cAS4C+Lp74ALJL0F/VVFfH/Suyj/wJcCTwL7A18rnj+alqncv4XeAy4e9jrzgMGitddSesCc3SQC74R0VOKC77X2F5edy2TWY78IyIaaGqZFxcXbr5P66LLU8Cf2f5lh3avAY8Ui/9j+7Qy/UZEf5O0fTerTu5qIQ1W6rSPpH8Enrd9saSvAG+1/eUO7bbbHihRZ0REVKhs+D8BHGf7GUlzgNttH9qhXcI/IqKHlD3n/3bbzwAU/75tN+32lrRW0t2SPlKyz4iIKGnUc/6SfgIc0GHV3+xBPwfa3izpncCtkh4pxvwY3tdSYCnAvvvu+8F3v/vde9BF73rwwQfrLqEyBx10UN0lVObpp5/eZnt2t/udNm2ap0+f3u1uoyFefvllXn31VY3WriunfYa95krgRturRmq3aNEi33HHHeOurZfMmDGj7hIqs3z55Ln77txzz73f9uJu9zswMODDDz+8291GQ6xbt47t27ePGv5lT/us5v+/QXc28MPhDSS9VdL04vEs4BhaX86IiIialA3/i4GTJP03cFKxjKTFknYcIr4HWCvpIeA24GLbCf+IiBqVus/f9nPACR2eXwucWzz+L+D9ZfqJiIhq5Ru+ERENlPCPiGighH9ESZKWSHpC0sbim+4RPS/hH1GCpCm05o09GTgMOKsYRz6ipyX8I8o5Etho+0nbrwDX0ZppKqKnJfwjypnLznPGDhbP7UTS0mKIk7VDQ0NdKy5idxL+EeV0+iblLl+bt73M9mLbi6dOLXWHdUQlEv4R5Qyy84Th84DNNdUSMWYJ/4hy7gMWSjpY0puAM2kNexLR0/L3Z0QJtocknQfcDEwBrrC9vuayIkaV8I8oyfYaYE3ddUTsiZz2iYhooI
R/REQDJfwjIhoo4R8R0UAJ/4iIBkr4R0Q0UCXhP9qQtpKmS/p+sf4eSQuq6DciIsandPiPcUjbTwG/tH0I8E/AP5TtNyIixq+KI/+xDGl7OnBV8XgVcIKkTgNiRUREF1QR/mMZ0vaNNraHgBeAmcM31D7s7bZt2yooLSIiOqki/McypO0eD3s7a9asCkqLiIhOqgj/sQxp+0YbSVOBtwDPV9B3RESMQxXhP5YhbVcDZxePzwButb3LkX9ERHRH6fAvzuHvGNJ2A/AD2+slfV3SaUWzy4GZkjYCXwB2uR00ol9JukLSFkmP1l1LxFhVMqRzpyFtbV/Y9vh3wJ9W0VdED7oSuAS4uuY6IsYs3/CNKMn2neQaVvSZhH9EF7Tfxjw0NFR3OREJ/4huaL+NeerUTKAX9Uv4R0Q0UMI/IqKBEv4RJUm6FvgZcKikQUmfqrumiNHk5GNESbbPqruGiD2VI/+IiAZK+EdENFDCPyKigRL+ERENlPCPiGig3O0TESO66aabKt/mjBkzKt8mwPLlyydkuytWrJiQ7dYpR/4REQ2U8I+IaKCEf0REA1US/pKWSHpC0kZJu8zSJekcSVslrSt+zq2i34iIGJ/SF3wlTQG+C5xEa6L2+ySttv3YsKbft31e2f4iIqK8Ko78jwQ22n7S9ivAdcDpFWw3IiImSBW3es4FNrUtDwIf6tDuTyQdC/wc+Cvbm4Y3kLQUWApw4IEHst9++1VQXv3OPvvsukuozIknnlh3CRFRgSqO/NXhOQ9b/g9gge3fB34CXNVpQ+2zHc2ePbuC0iImlqT5km6TtEHSeknn111TxFhUEf6DwPy25XnA5vYGtp+z/XKx+D3ggxX0G9ELhoAv2n4PcBTwGUmH1VxTxKiqCP/7gIWSDpb0JuBMYHV7A0lz2hZPAzZU0G9E7Ww/Y/uB4vFLtPbtufVWFTG60uf8bQ9JOg+4GZgCXGF7vaSvA2ttrwY+J+k0WkdJzwPnlO03otdIWgAcAdzTYd0b17OmT5/e1boiOqlkbB/ba4A1w567sO3xBcAFVfQV0YskDQDXA5+3/eLw9baXAcsABgYGhl8Ti+i6fMM3oiRJ02gF/0rbN9RdT8RYJPwjSpAk4HJgg+1v1l1PxFgl/CPKOQb4GHB82/Alp9RdVMRoMp5/RAm276Lzd10ielqO/CMiGijhHxHRQAn/iIgGSvhHRDRQwj8iooFyt09EjGgihlafqGHOJ2rI8RUrVkzIduuUI/+IiAZK+EdENFDCPyKigRL+ERENlPCPiGighH9ERANVEv6SrpC0RdKju1kvSd+RtFHSw5IWVdFvRC+QtLekeyU9VEzi/rd11xQxmqqO/K8Eloyw/mRgYfGzFLi0on4jesHLwPG2PwAcDiyRdFTNNUWMqJLwt30nrbl5d+d04Gq33A3sP2xS94i+VezX24vFacVPpmqMntatc/5zgU1ty4PFcxGTgqQpktYBW4BbbO8yiXtEL+lW+Hea7GKXIyNJSyWtlbR269atXSgrohq2X7N9ODAPOFLS+9rXt+/bQ0ND9RQZ0aZb4T8IzG9bngdsHt7I9jLbi20vnj17dpdKi6iO7V8BtzPsGlj7vj11aobUivp1K/xXAx8v7vo5CnjB9jNd6jtiQkmaLWn/4vGbgROBx+utKmJklRyCSLoWOA6YJWkQ+Bqti17YvgxYA5wCbAR+A3yiin4jesQc4CpJU2gdUP3A9o011xQxokrC3/ZZo6w38Jkq+oroNbYfBo6ou46IPZFv+EZENFDCPyKigRL+ERENlPCPiGighH9ERAPl2yYRMaIDDjig8m1ec801lW8TYMmSkcaXHL+ZM2dOyHbrlCP/iIgGSvhHRDRQwj8iooES/hERDZTwj4hooIR/REQDJfwjIhoo4R9RgWIaxwclZSjn6AsJ/4hqnA9sqLuIiLFK+EeUJGke8GFged21RIxVwj+ivG8BXwJe312DTOAevaaS8Jd0haQtkh7dzfrjJL
0gaV3xc2EV/UbUTdKpwBbb94/ULhO4R6+pai+8ErgEuHqENj+1fWpF/UX0imOA0ySdAuwNzJB0je2P1lxXxIgqOfK3fSfwfBXbiugnti+wPc/2AuBM4NYEf/SDbv79ebSkh4DNwF/bXj+8gaSlwFKAvfbaa0KGkq3DRA1fW4eJGjI3IrqrW+H/AHCQ7e3Fn8f/Diwc3sj2MmAZwLRp09yl2iIqYft24Paay4gYk67c7WP7Rdvbi8drgGmSZnWj74iI2FVXwl/SAZJUPD6y6Pe5bvQdERG7quS0j6RrgeOAWZIGga8B0wBsXwacAXxa0hDwW+BM2zmtExFRk0rC3/ZZo6y/hNatoBER0QPyDd+IiAbKVw0jYkSHHHJI5du86KKLKt8mwMyZMydku5NRjvwjIhoo4R8R0UAJ/4iIBkr4R0Q0UMI/IqKBEv4REQ2U8I+IaKDc5x9RAUlPAS8BrwFDthfXW1HEyBL+EdX5I9vb6i4iYixy2iciooES/hHVMPCfku4vZqTbiaSlktZKWjs0NFRDeRE7y2mfiGocY3uzpLcBt0h6vJjbGth5lrqBgYEMZx61y5F/RAVsby7+3QL8G3BkvRVFjCzhH1GSpH0l7bfjMfDHwKP1VhUxstLhL2m+pNskbZC0XtL5HdpI0nckbZT0sKRFZfuN6CFvB+6S9BBwL/Aj2z+uuaaIEVVxzn8I+KLtB4qjn/sl3WL7sbY2JwMLi58PAZcW/0b0PdtPAh+ou46IPVH6yN/2M7YfKB6/BGwA5g5rdjpwtVvuBvaXNKds3xERMT6VnvOXtAA4Arhn2Kq5wKa25UF2/Q9ip9vhXn/99SpLi4iINpWFv6QB4Hrg87ZfHL66w0t2ud3N9jLbi20v3muvXIuOiJgolSSspGm0gn+l7Rs6NBkE5rctzwM2V9F3RETsuSru9hFwObDB9jd302w18PHirp+jgBdsP1O274iIGJ8q7vY5BvgY8IikdcVzXwUOBLB9GbAGOAXYCPwG+EQF/UZExDiVDn/bd9H5nH57GwOfKdtXRERUI1dVIyIaKOEfEdFACf+IiAZK+EdENFDCPyKigRL+ERENlPCPKEnS/pJWSXq8GNr86LprihhNpnGMKO/bwI9tnyHpTcA+dRcUMZqEf0QJkmYAxwLnANh+BXilzpoixiKnfSLKeSewFVgh6UFJy4upHHfSPlz50NBQ96uMGCbhH1HOVGARcKntI4BfA18Z3qh9uPKpU/MHd9Qv4R9RziAwaHvHBEaraP1nENHTEv4RJdh+Ftgk6dDiqROAx0Z4SURPyN+fEeV9FlhZ3OnzJBmyPPpAwj+iJNvrgMV11xGxJ3LaJyKigaqYxnG+pNuKbzaul3R+hzbHSXpB0rri58Ky/UZExPhVcdpnCPii7Qck7QfcL+kW28Mvev3U9qkV9BcRESWVPvK3/YztB4rHLwEbgLlltxsREROn0nP+khYARwD3dFh9tKSHJN0k6b1V9hsREXtGrbnVK9iQNADcAfyd7RuGrZsBvG57u6RTgG/bXthhG0uBpcXiocATlRQ3slnAti700w2T5b10630cZHt2F/rZiaStwNNjbN5Pn2k/1Qr9Ve+e1Dqm/bqS8Jc0DbgRuNn2N8fQ/ilgse3af/GS1tqeFLfpTZb3MlneRxX66XfRT7VCf9U7EbVWcbePgMuBDbsLfkkHFO2QdGTR73Nl+46IiPGp4m6fY4CPAY9IWlc891XgQADblwFnAJ+WNAT8FjjTVZ1vioiIPVY6/G3fBWiUNpcAl5Tta4Isq7uACk2W9zJZ3kcV+ul30U+1Qn/VW3mtlV3wjYiI/pHhHSIiGqix4S9piaQnJG2UtMvkG/1C0hWStkh6tO5ayhrLUCFN0U/7Zz9+bpKmFDOv3Vh3LaORtL+kVZIeL37HR1ey3Sae9pE0Bfg5cBKtyTjuA87qMCRFz5N0LLAduNr2++qupwxJc4A57UOFAB/px8+ljH7bP/vxc5P0BVojsc
7o9WFnJF1Fa3ic5cWw4fvY/lXZ7Tb1yP9IYKPtJ4sJt68DTq+5pnGxfSfwfN11VCFDhbyhr/bPfvvcJM0DPgwsr7uW0RRfkD2W1u302H6liuCH5ob/XGBT2/IgPbyzNtEoQ4VMdn27f/bJ5/Yt4EvA63UXMgbvBLYCK4rTVMsl7VvFhpsa/p1uTW3e+a8eVQwVcj3wedsv1l1PDfpy/+yHz03SqcAW2/fXXcsYTaU1J/Slto8Afg1Ucg2oqeE/CMxvW54HbK6plmhTDBVyPbBy+BhRDdJ3+2cffW7HAKcVQ8xcBxwv6Zp6SxrRIDBoe8dfUqto/WdQWlPD/z5goaSDiwsoZwKra66p8cYyVEhD9NX+2U+fm+0LbM+zvYDW7/VW2x+tuazdsv0ssEnSocVTJwCVXEhvZPjbHgLOA26mdXHqB7bX11vV+Ei6FvgZcKikQUmfqrumEnYMFXJ826xvp9RdVLf14f6Zz21ifRZYKelh4HDgG1VstJG3ekZENF0jj/wjIpou4R8R0UAJ/4iIBkr4R0Q0UMI/IqKBEv4REQ2U8I+IaKCEf0REA/0fZbvVf+dwy5sAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "np.random.seed(1)\n", + "x = np.random.randn(4, 3, 3, 2)\n", + "x_pad = zero_pad(x, 2)\n", + "print (\"x.shape =\", x.shape)\n", + "print (\"x_pad.shape =\", x_pad.shape)\n", + "print (\"x[1,1] =\", x[1,1])\n", + "print (\"x_pad[1,1] =\", x_pad[1,1])\n", + "\n", + "fig, axarr = plt.subplots(1, 2)\n", + "axarr[0].set_title('x')\n", + "axarr[0].imshow(x[0,:,:,0])\n", + "axarr[1].set_title('x_pad')\n", + "axarr[1].imshow(x_pad[0,:,:,0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**:\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + " **x.shape**:\n", + " \n", + " (4, 3, 3, 2)\n", + "
\n", + " **x_pad.shape**:\n", + " \n", + " (4, 7, 7, 2)\n", + "
\n", + " **x[1,1]**:\n", + " \n", + " [[ 0.90085595 -0.68372786]\n", + " [-0.12289023 -0.93576943]\n", + " [-0.26788808 0.53035547]]\n", + "
\n", + " **x_pad[1,1]**:\n", + " \n", + " [[ 0. 0.]\n", + " [ 0. 0.]\n", + " [ 0. 0.]\n", + " [ 0. 0.]\n", + " [ 0. 0.]\n", + " [ 0. 0.]\n", + " [ 0. 0.]]\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2 - Single step of convolution \n", + "\n", + "In this part, implement a single step of convolution, in which you apply the filter to a single position of the input. This will be used to build a convolutional unit, which: \n", + "\n", + "- Takes an input volume \n", + "- Applies a filter at every position of the input\n", + "- Outputs another volume (usually of different size)\n", + "\n", + "\n", + "
**Figure 2** : **Convolution operation**
with a filter of 3x3 and a stride of 1 (stride = amount you move the window each time you slide) 
\n", + "\n", + "In a computer vision application, each value in the matrix on the left corresponds to a single pixel value, and we convolve a 3x3 filter with the image by multiplying its values element-wise with the original matrix, then summing them up and adding a bias. In this first step of the exercise, you will implement a single step of convolution, corresponding to applying a filter to just one of the positions to get a single real-valued output. \n", + "\n", + "Later in this notebook, you'll apply this function to multiple positions of the input to implement the full convolutional operation. \n", + "\n", + "**Exercise**: Implement conv_single_step(). [Hint](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.sum.html).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# GRADED FUNCTION: conv_single_step\n", + "\n", + "def conv_single_step(a_slice_prev, W, b):\n", + " \"\"\"\n", + " Apply one filter defined by parameters W on a single slice (a_slice_prev) of the output activation \n", + " of the previous layer.\n", + " \n", + " Arguments:\n", + " a_slice_prev -- slice of input data of shape (f, f, n_C_prev)\n", + " W -- Weight parameters contained in a window - matrix of shape (f, f, n_C_prev)\n", + " b -- Bias parameters contained in a window - matrix of shape (1, 1, 1)\n", + " \n", + " Returns:\n", + " Z -- a scalar value, result of convolving the sliding window (W, b) on a slice x of the input data\n", + " \"\"\"\n", + "\n", + " ### START CODE HERE ### (≈ 2 lines of code)\n", + " # Element-wise product between a_slice and W. Do not add the bias yet.\n", + " s = np.multiply(a_slice_prev, W)\n", + " # Sum over all entries of the volume s.\n", + " Z = np.sum(s)\n", + " # Add bias b to Z. 
Cast b to a float() so that Z results in a scalar value.\n", + " Z = Z + b\n", + " ### END CODE HERE ###\n", + "\n", + " return Z" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Z = [[[-6.99908945]]]\n" + ] + } + ], + "source": [ + "np.random.seed(1)\n", + "a_slice_prev = np.random.randn(4, 4, 3)\n", + "W = np.random.randn(4, 4, 3)\n", + "b = np.random.randn(1, 1, 1)\n", + "\n", + "Z = conv_single_step(a_slice_prev, W, b)\n", + "print(\"Z =\", Z)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**:\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + " **Z**\n", + " \n", + " -6.99908945068\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### 3.3 - Convolutional Neural Networks - Forward pass\n", + "\n", + "In the forward pass, you will take many filters and convolve them on the input. Each 'convolution' gives you a 2D matrix output. You will then stack these outputs to get a 3D volume: \n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "**Exercise**: Implement the function below to convolve the filters W on an input activation A_prev. This function takes as input A_prev, the activations output by the previous layer (for a batch of m inputs), F filters/weights denoted by W, and a bias vector denoted by b, where each filter has its own (single) bias. Finally you also have access to the hyperparameters dictionary which contains the stride and the padding. \n", + "\n", + "**Hint**: \n", + "1. To select a 2x2 slice at the upper left corner of a matrix \"a_prev\" (shape (5,5,3)), you would do:\n", + "```python\n", + "a_slice_prev = a_prev[0:2,0:2,:]\n", + "```\n", + "This will be useful when you will define `a_slice_prev` below, using the `start/end` indexes you will define.\n", + "2. To define a_slice you will need to first define its corners `vert_start`, `vert_end`, `horiz_start` and `horiz_end`. This figure may be helpful for you to find how each of the corner can be defined using h, w, f and s in the code below.\n", + "\n", + "\n", + "
**Figure 3** : **Definition of a slice using vertical and horizontal start/end (with a 2x2 filter)**
This figure shows only a single channel.
\n", + "\n", + "\n", + "**Reminder**:\n", + "The formulas relating the output shape of the convolution to the input shape is:\n", + "$$ n_H = \\lfloor \\frac{n_{H_{prev}} - f + 2 \\times pad}{stride} \\rfloor +1 $$\n", + "$$ n_W = \\lfloor \\frac{n_{W_{prev}} - f + 2 \\times pad}{stride} \\rfloor +1 $$\n", + "$$ n_C = \\text{number of filters used in the convolution}$$\n", + "\n", + "For this exercise, we won't worry about vectorization, and will just implement everything with for-loops." + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "# GRADED FUNCTION: conv_forward\n", + "\n", + "def conv_forward(A_prev, W, b, hparameters):\n", + " \"\"\"\n", + " Implements the forward propagation for a convolution function\n", + " \n", + " Arguments:\n", + " A_prev -- output activations of the previous layer, numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)\n", + " W -- Weights, numpy array of shape (f, f, n_C_prev, n_C)\n", + " b -- Biases, numpy array of shape (1, 1, 1, n_C)\n", + " hparameters -- python dictionary containing \"stride\" and \"pad\"\n", + " \n", + " Returns:\n", + " Z -- conv output, numpy array of shape (m, n_H, n_W, n_C)\n", + " cache -- cache of values needed for the conv_backward() function\n", + " \"\"\"\n", + " \n", + " ### START CODE HERE ###\n", + " # Retrieve dimensions from A_prev's shape (≈1 line) \n", + " (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape\n", + " \n", + " # Retrieve dimensions from W's shape (≈1 line)\n", + " (f, f, n_C_prev, n_C) = W.shape\n", + " \n", + " # Retrieve information from \"hparameters\" (≈2 lines)\n", + " stride = hparameters['stride']\n", + " pad = hparameters['pad']\n", + " \n", + " # Compute the dimensions of the CONV output volume using the formula given above. Hint: use int() to floor. 
(≈2 lines)\n", + " n_H = np.int((n_H_prev-f+2*pad)/stride + 1)\n", + " n_W = np.int((n_W_prev-f+2*pad)/stride + 1)\n", + " \n", + " # Initialize the output volume Z with zeros. (≈1 line)\n", + " Z = np.zeros((m,n_H,n_W,n_C ))\n", + " \n", + " # Create A_prev_pad by padding A_prev\n", + " A_prev_pad = zero_pad(A_prev, pad)\n", + " for i in range(m): # loop over the batch of training examples\n", + " a_prev_pad = A_prev_pad[i] # Select ith training example's padded activation\n", + " for h in range(n_H): # loop over vertical axis of the output volume\n", + " for w in range(n_W): # loop over horizontal axis of the output volume\n", + " for c in range(n_C): # loop over channels (= #filters) of the output volume\n", + " \n", + " # Find the corners of the current \"slice\" (≈4 lines)\n", + " vert_start = h*stride\n", + " vert_end = vert_start+f\n", + " horiz_start = w*stride\n", + " horiz_end = horiz_start+f\n", + " \n", + " # Use the corners to define the (3D) slice of a_prev_pad (See Hint above the cell). (≈1 line)\n", + " a_slice_prev = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]\n", + " \n", + " # Convolve the (3D) slice with the correct filter W and bias b, to get back one output neuron. 
(≈1 line)\n", + " Z[i, h, w, c] = conv_single_step(a_slice_prev, W[:,:,:,c], b[:,:,:,c])\n", + " \n", + " ### END CODE HERE ###\n", + " \n", + " # Making sure your output shape is correct\n", + " assert(Z.shape == (m, n_H, n_W, n_C))\n", + " \n", + " # Save information in \"cache\" for the backprop\n", + " cache = (A_prev, W, b, hparameters)\n", + " \n", + " return Z, cache" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Z's mean = 0.048995203528855794\n", + "Z[3,2,1] = [-0.61490741 -6.7439236 -2.55153897 1.75698377 3.56208902 0.53036437\n", + " 5.18531798 8.75898442]\n", + "cache_conv[0][1][2][3] = [-0.20075807 0.18656139 0.41005165]\n" + ] + } + ], + "source": [ + "np.random.seed(1)\n", + "A_prev = np.random.randn(10,4,4,3)\n", + "W = np.random.randn(2,2,3,8)\n", + "b = np.random.randn(1,1,1,8)\n", + "hparameters = {\"pad\" : 2,\n", + " \"stride\": 2}\n", + "\n", + "Z, cache_conv = conv_forward(A_prev, W, b, hparameters)\n", + "print(\"Z's mean =\", np.mean(Z))\n", + "print(\"Z[3,2,1] =\", Z[3,2,1])\n", + "print(\"cache_conv[0][1][2][3] =\", cache_conv[0][1][2][3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**:\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + " **Z's mean**\n", + " \n", + " 0.0489952035289\n", + "
\n", + " **Z[3,2,1]**\n", + " \n", + " [-0.61490741 -6.7439236 -2.55153897 1.75698377 3.56208902 0.53036437\n", + " 5.18531798 8.75898442]\n", + "
\n", + " **cache_conv[0][1][2][3]**\n", + " \n", + " [-0.20075807 0.18656139 0.41005165]\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, CONV layer should also contain an activation, in which case we would add the following line of code:\n", + "\n", + "```python\n", + "# Convolve the window to get back one output neuron\n", + "Z[i, h, w, c] = ...\n", + "# Apply activation\n", + "A[i, h, w, c] = activation(Z[i, h, w, c])\n", + "```\n", + "\n", + "You don't need to do it here. \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4 - Pooling layer \n", + "\n", + "The pooling (POOL) layer reduces the height and width of the input. It helps reduce computation, as well as helps make feature detectors more invariant to its position in the input. The two types of pooling layers are: \n", + "\n", + "- Max-pooling layer: slides an ($f, f$) window over the input and stores the max value of the window in the output.\n", + "\n", + "- Average-pooling layer: slides an ($f, f$) window over the input and stores the average value of the window in the output.\n", + "\n", + "\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + "These pooling layers have no parameters for backpropagation to train. However, they have hyperparameters such as the window size $f$. This specifies the height and width of the fxf window you would compute a max or average over. \n", + "\n", + "### 4.1 - Forward Pooling\n", + "Now, you are going to implement MAX-POOL and AVG-POOL, in the same function. \n", + "\n", + "**Exercise**: Implement the forward pass of the pooling layer. Follow the hints in the comments below.\n", + "\n", + "**Reminder**:\n", + "As there's no padding, the formulas binding the output shape of the pooling to the input shape is:\n", + "$$ n_H = \\lfloor \\frac{n_{H_{prev}} - f}{stride} \\rfloor +1 $$\n", + "$$ n_W = \\lfloor \\frac{n_{W_{prev}} - f}{stride} \\rfloor +1 $$\n", + "$$ n_C = n_{C_{prev}}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "# GRADED FUNCTION: pool_forward\n", + "\n", + "def pool_forward(A_prev, hparameters, mode = \"max\"):\n", + " \"\"\"\n", + " Implements the forward pass of the pooling layer\n", + " \n", + " Arguments:\n", + " A_prev -- Input data, numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)\n", + " hparameters -- python dictionary containing \"f\" and \"stride\"\n", + " mode -- the pooling mode you would like to use, defined as a string (\"max\" or \"average\")\n", + " \n", + " Returns:\n", + " A -- output of the pool layer, a numpy array of shape (m, n_H, n_W, n_C)\n", + " cache -- cache used in the backward pass of the pooling layer, contains the input and hparameters \n", + " \"\"\"\n", + " \n", + " # Retrieve dimensions from the input shape\n", + " (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape\n", + " \n", + " # Retrieve hyperparameters from \"hparameters\"\n", + " f = hparameters[\"f\"]\n", + " stride = hparameters[\"stride\"]\n", + " \n", + " # Define the dimensions of the output\n", + " n_H = int(1 + (n_H_prev - f) / stride)\n", + " n_W = int(1 + (n_W_prev - f) 
/ stride)\n", + " n_C = n_C_prev\n", + " \n", + " # Initialize output matrix A\n", + " A = np.zeros((m, n_H, n_W, n_C)) \n", + " \n", + " ### START CODE HERE ###\n", + " for i in range(m): # loop over the training examples\n", + " for h in range(n_H): # loop on the vertical axis of the output volume\n", + " for w in range(n_W): # loop on the horizontal axis of the output volume\n", + " for c in range (n_C): # loop over the channels of the output volume\n", + " \n", + " # Find the corners of the current \"slice\" (≈4 lines)\n", + " vert_start = h*stride\n", + " vert_end = vert_start+f\n", + " horiz_start = w*stride\n", + " horiz_end = horiz_start+f\n", + " \n", + " # Use the corners to define the current slice on the ith training example of A_prev, channel c. (≈1 line)\n", + " a_prev_slice = A_prev[i,vert_start:vert_end,horiz_start:horiz_end,c ]\n", + " \n", + " # Compute the pooling operation on the slice. Use an if statment to differentiate the modes. Use np.max/np.mean.\n", + " if mode == \"max\":\n", + " A[i, h, w, c] = np.max(a_prev_slice)\n", + " elif mode == \"average\":\n", + " A[i, h, w, c] = np.mean(a_prev_slice)\n", + " \n", + " ### END CODE HERE ###\n", + " \n", + " # Store the input and hparameters in \"cache\" for pool_backward()\n", + " cache = (A_prev, hparameters)\n", + " \n", + " # Making sure your output shape is correct\n", + " assert(A.shape == (m, n_H, n_W, n_C))\n", + " \n", + " return A, cache" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mode = max\n", + "A = [[[[1.74481176 0.86540763 1.13376944]]]\n", + "\n", + "\n", + " [[[1.13162939 1.51981682 2.18557541]]]]\n", + "\n", + "mode = average\n", + "A = [[[[ 0.02105773 -0.20328806 -0.40389855]]]\n", + "\n", + "\n", + " [[[-0.22154621 0.51716526 0.48155844]]]]\n" + ] + } + ], + "source": [ + "np.random.seed(1)\n", + "A_prev = np.random.randn(2, 4, 4, 3)\n", + "hparameters = 
{\"stride\" : 2, \"f\": 3}\n", + "\n", + "A, cache = pool_forward(A_prev, hparameters)\n", + "print(\"mode = max\")\n", + "print(\"A =\", A)\n", + "print()\n", + "A, cache = pool_forward(A_prev, hparameters, mode = \"average\")\n", + "print(\"mode = average\")\n", + "print(\"A =\", A)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output:**\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " A =\n", + " \n", + " [[[[ 1.74481176 0.86540763 1.13376944]]]\n", + " [[[ 1.13162939 1.51981682 2.18557541]]]]\n", + "
\n", + " A =\n", + " \n", + " [[[[ 0.02105773 -0.20328806 -0.40389855]]]\n", + " [[[-0.22154621 0.51716526 0.48155844]]]]\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Congratulations! You have now implemented the forward passes of all the layers of a convolutional network. \n", + "\n", + "The remainder of this notebook is optional, and will not be graded.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5 - Backpropagation in convolutional neural networks (OPTIONAL / UNGRADED)\n", + "\n", + "In modern deep learning frameworks, you only have to implement the forward pass, and the framework takes care of the backward pass, so most deep learning engineers don't need to bother with the details of the backward pass. The backward pass for convolutional networks is complicated. If you wish however, you can work through this optional portion of the notebook to get a sense of what backprop in a convolutional network looks like. \n", + "\n", + "When in an earlier course you implemented a simple (fully connected) neural network, you used backpropagation to compute the derivatives with respect to the cost to update the parameters. Similarly, in convolutional neural networks you can calculate the derivatives with respect to the cost in order to update the parameters. The backprop equations are not trivial and we did not derive them in lecture, but we briefly present them below.\n", + "\n", + "### 5.1 - Convolutional layer backward pass \n", + "\n", + "Let's start by implementing the backward pass for a CONV layer. 
\n", + "\n", + "#### 5.1.1 - Computing dA:\n", + "This is the formula for computing $dA$ with respect to the cost for a certain filter $W_c$ and a given training example:\n", + "\n", + "$$ dA += \\sum _{h=0} ^{n_H} \\sum_{w=0} ^{n_W} W_c \\times dZ_{hw} \\tag{1}$$\n", + "\n", + "Where $W_c$ is a filter and $dZ_{hw}$ is a scalar corresponding to the gradient of the cost with respect to the output of the conv layer Z at the hth row and wth column (corresponding to the dot product taken at the ith stride left and jth stride down). Note that at each time, we multiply the same filter $W_c$ by a different dZ when updating dA. We do so mainly because when computing the forward propagation, each filter is dotted and summed by a different a_slice. Therefore when computing the backprop for dA, we are just adding the gradients of all the a_slices. \n", + "\n", + "In code, inside the appropriate for-loops, this formula translates into:\n", + "```python\n", + "da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:,:,:,c] * dZ[i, h, w, c]\n", + "```\n", + "\n", + "#### 5.1.2 - Computing dW:\n", + "This is the formula for computing $dW_c$ ($dW_c$ is the derivative of one filter) with respect to the loss:\n", + "\n", + "$$ dW_c += \\sum _{h=0} ^{n_H} \\sum_{w=0} ^ {n_W} a_{slice} \\times dZ_{hw} \\tag{2}$$\n", + "\n", + "Where $a_{slice}$ corresponds to the slice which was used to generate the activation $Z_{hw}$. Hence, this ends up giving us the gradient for $W$ with respect to that slice. Since it is the same $W$, we will just add up all such gradients to get $dW$. 
\n", + "\n", + "In code, inside the appropriate for-loops, this formula translates into:\n", + "```python\n", + "dW[:,:,:,c] += a_slice * dZ[i, h, w, c]\n", + "```\n", + "\n", + "#### 5.1.3 - Computing db:\n", + "\n", + "This is the formula for computing $db$ with respect to the cost for a certain filter $W_c$:\n", + "\n", + "$$ db = \\sum_h \\sum_w dZ_{hw} \\tag{3}$$\n", + "\n", + "As you have previously seen in basic neural networks, db is computed by summing $dZ$. In this case, you are just summing over all the gradients of the conv output (Z) with respect to the cost. \n", + "\n", + "In code, inside the appropriate for-loops, this formula translates into:\n", + "```python\n", + "db[:,:,:,c] += dZ[i, h, w, c]\n", + "```\n", + "\n", + "**Exercise**: Implement the `conv_backward` function below. You should sum over all the training examples, filters, heights, and widths. You should then compute the derivatives using formulas 1, 2 and 3 above. " + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "def conv_backward(dZ, cache):\n", + " \"\"\"\n", + " Implement the backward propagation for a convolution function\n", + " \n", + " Arguments:\n", + " dZ -- gradient of the cost with respect to the output of the conv layer (Z), numpy array of shape (m, n_H, n_W, n_C)\n", + " cache -- cache of values needed for the conv_backward(), output of conv_forward()\n", + " \n", + " Returns:\n", + " dA_prev -- gradient of the cost with respect to the input of the conv layer (A_prev),\n", + " numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)\n", + " dW -- gradient of the cost with respect to the weights of the conv layer (W)\n", + " numpy array of shape (f, f, n_C_prev, n_C)\n", + " db -- gradient of the cost with respect to the biases of the conv layer (b)\n", + " numpy array of shape (1, 1, 1, n_C)\n", + " \"\"\"\n", + " \n", + " ### START CODE HERE ###\n", + " # Retrieve information from \"cache\"\n", + " (A_prev, W, 
b, hparameters) = cache\n", + " \n", + " # Retrieve dimensions from A_prev's shape\n", + " (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape\n", + " \n", + " # Retrieve dimensions from W's shape\n", + " (f, f, n_C_prev, n_C) = W.shape\n", + " \n", + " # Retrieve information from \"hparameters\"\n", + " stride = hparameters['stride']\n", + " pad = hparameters['pad']\n", + " \n", + " # Retrieve dimensions from dZ's shape\n", + " (m, n_H, n_W, n_C) = dZ.shape\n", + " \n", + " # Initialize dA_prev, dW, db with the correct shapes\n", + " dA_prev = np.zeros(A_prev.shape) \n", + " dW = np.zeros(W.shape)\n", + " db = np.zeros(b.shape)\n", + "\n", + " # Pad A_prev and dA_prev\n", + " A_prev_pad = zero_pad(A_prev, pad)\n", + " dA_prev_pad = zero_pad(dA_prev, pad)\n", + " \n", + " for i in range(m): # loop over the training examples\n", + " \n", + " # select ith training example from A_prev_pad and dA_prev_pad\n", + " a_prev_pad = A_prev_pad[i]\n", + " da_prev_pad = dA_prev_pad[i]\n", + " \n", + " for h in range(n_H): # loop over vertical axis of the output volume\n", + " for w in range(n_W): # loop over horizontal axis of the output volume\n", + " for c in range(n_C): # loop over the channels of the output volume\n", + " \n", + " # Find the corners of the current \"slice\"\n", + " vert_start = h*stride\n", + " vert_end = vert_start+f\n", + " horiz_start = w*stride\n", + " horiz_end = horiz_start+f\n", + " \n", + " # Use the corners to define the slice from a_prev_pad\n", + " a_slice = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]\n", + "\n", + " # Update gradients for the window and the filter's parameters using the code formulas given above\n", + " da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:,:,:,c]*dZ[i,h,w,c]\n", + " dW[:,:,:,c] += a_slice*dZ[i,h,w,c]\n", + " db[:,:,:,c] += dZ[i,h,w,c]\n", + " \n", + " # Set the ith training example's dA_prev to the unpaded da_prev_pad (Hint: use X[pad:-pad, pad:-pad, :])\n", + " dA_prev[i, :, :, :] = 
da_prev_pad[pad:-pad, pad:-pad, :]\n", + " ### END CODE HERE ###\n", + " \n", + " # Making sure your output shape is correct\n", + " assert(dA_prev.shape == (m, n_H_prev, n_W_prev, n_C_prev))\n", + " \n", + " return dA_prev, dW, db" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dA_mean = 1.4524377775388075\n", + "dW_mean = 1.7269914583139097\n", + "db_mean = 7.839232564616838\n" + ] + } + ], + "source": [ + "np.random.seed(1)\n", + "dA, dW, db = conv_backward(Z, cache_conv)\n", + "print(\"dA_mean =\", np.mean(dA))\n", + "print(\"dW_mean =\", np.mean(dW))\n", + "print(\"db_mean =\", np.mean(db))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "** Expected Output: **\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + " **dA_mean**\n", + " \n", + " 1.45243777754\n", + "
\n", + " **dW_mean**\n", + " \n", + " 1.72699145831\n", + "
\n", + " **db_mean**\n", + " \n", + " 7.83923256462\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5.2 Pooling layer - backward pass\n", + "\n", + "Next, let's implement the backward pass for the pooling layer, starting with the MAX-POOL layer. Even though a pooling layer has no parameters for backprop to update, you still need to backpropagate the gradient through the pooling layer in order to compute gradients for layers that came before the pooling layer. \n", + "\n", + "### 5.2.1 Max pooling - backward pass \n", + "\n", + "Before jumping into the backpropagation of the pooling layer, you are going to build a helper function called `create_mask_from_window()` which does the following: \n", + "\n", + "$$ X = \\begin{bmatrix}\n", + "1 && 3 \\\\\n", + "4 && 2\n", + "\\end{bmatrix} \\quad \\rightarrow \\quad M =\\begin{bmatrix}\n", + "0 && 0 \\\\\n", + "1 && 0\n", + "\\end{bmatrix}\\tag{4}$$\n", + "\n", + "As you can see, this function creates a \"mask\" matrix which keeps track of where the maximum of the matrix is. True (1) indicates the position of the maximum in X, the other entries are False (0). You'll see later that the backward pass for average pooling will be similar to this but using a different mask. \n", + "\n", + "**Exercise**: Implement `create_mask_from_window()`. This function will be helpful for pooling backward. \n", + "Hints:\n", + "- `np.max()` may be helpful. It computes the maximum of an array.\n", + "- If you have a matrix X and a scalar x: `A = (X == x)` will return a matrix A of the same size as X such that:\n", + "```\n", + "A[i,j] = True if X[i,j] = x\n", + "A[i,j] = False if X[i,j] != x\n", + "```\n", + "- Here, you don't need to consider cases where there are several maxima in a matrix." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "def create_mask_from_window(x):\n", + " \"\"\"\n", + " Creates a mask from an input matrix x, to identify the max entry of x.\n", + " \n", + " Arguments:\n", + " x -- Array of shape (f, f)\n", + " \n", + " Returns:\n", + " mask -- Array of the same shape as window, contains a True at the position corresponding to the max entry of x.\n", + " \"\"\"\n", + " \n", + " ### START CODE HERE ### (≈1 line)\n", + " mask = (x==np.max(x))\n", + " ### END CODE HERE ###\n", + " \n", + " return mask" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "x = [[ 1.62434536 -0.61175641 -0.52817175]\n", + " [-1.07296862 0.86540763 -2.3015387 ]]\n", + "mask = [[ True False False]\n", + " [False False False]]\n" + ] + } + ], + "source": [ + "np.random.seed(1)\n", + "x = np.random.randn(2,3)\n", + "mask = create_mask_from_window(x)\n", + "print('x = ', x)\n", + "print(\"mask = \", mask)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "**Expected Output:** \n", + "\n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + "**x =**\n", + "\n", + "\n", + "[[ 1.62434536 -0.61175641 -0.52817175]
\n", + " [-1.07296862 0.86540763 -2.3015387 ]]\n", + "\n", + "
\n", + "**mask =**\n", + "\n", + "[[ True False False]
\n", + " [False False False]]\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Why do we keep track of the position of the max? It's because this is the input value that ultimately influenced the output, and therefore the cost. Backprop is computing gradients with respect to the cost, so anything that influences the ultimate cost should have a non-zero gradient. So, backprop will \"propagate\" the gradient back to this particular input value that had influenced the cost. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.2.2 - Average pooling - backward pass \n", + "\n", + "In max pooling, for each input window, all the \"influence\" on the output came from a single input value--the max. In average pooling, every element of the input window has equal influence on the output. So to implement backprop, you will now implement a helper function that reflects this.\n", + "\n", + "For example if we did average pooling in the forward pass using a 2x2 filter, then the mask you'll use for the backward pass will look like: \n", + "$$ dZ = 1 \\quad \\rightarrow \\quad dZ =\\begin{bmatrix}\n", + "1/4 && 1/4 \\\\\n", + "1/4 && 1/4\n", + "\\end{bmatrix}\\tag{5}$$\n", + "\n", + "This implies that each position in the $dZ$ matrix contributes equally to output because in the forward pass, we took an average. \n", + "\n", + "**Exercise**: Implement the function below to equally distribute a value dz through a matrix of dimension shape. 
[Hint](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.ones.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "def distribute_value(dz, shape):\n", + " \"\"\"\n", + " Distributes the input value dz evenly over a matrix of dimension shape, so that the entries sum to dz\n", + " \n", + " Arguments:\n", + " dz -- input scalar\n", + " shape -- the shape (n_H, n_W) of the output matrix for which we want to distribute the value of dz\n", + " \n", + " Returns:\n", + " a -- Array of size (n_H, n_W) in which every entry equals dz / (n_H * n_W)\n", + " \"\"\"\n", + " \n", + " ### START CODE HERE ###\n", + " # Retrieve dimensions from shape (≈1 line)\n", + " (n_H, n_W) = shape\n", + " \n", + " # Compute the value to distribute on the matrix: dz split over all n_H*n_W cells (≈1 line)\n", + " average = dz/(n_H*n_W)\n", + " \n", + " # Create a matrix where every entry is the \"average\" value (≈1 line)\n", + " a = np.zeros(shape)+average\n", + " ### END CODE HERE ###\n", + " \n", + " return a" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "distributed value = [[0.5 0.5]\n", + " [0.5 0.5]]\n" + ] + } + ], + "source": [ + "a = distribute_value(2, (2,2))\n", + "print('distributed value =', a)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**: \n", + "\n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
\n", + "distributed_value =\n", + "\n", + "[[ 0.5 0.5]\n", + " \n", + "[ 0.5 0.5]]\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.2.3 Putting it together: Pooling backward \n", + "\n", + "You now have everything you need to compute backward propagation on a pooling layer.\n", + "\n", + "**Exercise**: Implement the `pool_backward` function in both modes (`\"max\"` and `\"average\"`). You will once again use 4 for-loops (iterating over training examples, height, width, and channels). You should use an `if/elif` statement to see if the mode is equal to `'max'` or `'average'`. If it is equal to 'average' you should use the `distribute_value()` function you implemented above to create a matrix of the same shape as `a_slice`. Otherwise, the mode is equal to '`max`', and you will create a mask with `create_mask_from_window()` and multiply it by the corresponding value of dZ." + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "def pool_backward(dA, cache, mode = \"max\"):\n", + " \"\"\"\n", + " Implements the backward pass of the pooling layer\n", + " \n", + " Arguments:\n", + " dA -- gradient of cost with respect to the output of the pooling layer, same shape as A\n", + " cache -- cache output from the forward pass of the pooling layer, contains the layer's input and hparameters \n", + " mode -- the pooling mode you would like to use, defined as a string (\"max\" or \"average\")\n", + " \n", + " Returns:\n", + " dA_prev -- gradient of cost with respect to the input of the pooling layer, same shape as A_prev\n", + " Raises ValueError when mode is neither \"max\" nor \"average\".\n", + " \"\"\"\n", + " \n", + " ### START CODE HERE ###\n", + " \n", + " # Retrieve information from cache (≈1 line)\n", + " (A_prev, hparameters) = cache\n", + " \n", + " # Retrieve hyperparameters from \"hparameters\" (≈2 lines)\n", + " stride = hparameters['stride']\n", + " f = hparameters['f']\n", + " \n", + " # Retrieve dimensions from A_prev's shape and dA's shape (≈2 lines)\n", + " m, n_H_prev, n_W_prev, n_C_prev = A_prev.shape\n", + " m, n_H, n_W, n_C = dA.shape\n", + " if mode not in (\"max\", \"average\"): # fail fast instead of silently returning an all-zero gradient\n", + " raise ValueError(f\"Unknown pooling mode: {mode!r}\")\n", + " \n", + " # Initialize dA_prev with zeros (≈1 line)\n", + " dA_prev = np.zeros(A_prev.shape)\n", + " \n", + " for i in range(m): # loop over the training examples\n", + " \n", + " # select training example from A_prev (≈1 line)\n", + " a_prev = A_prev[i]\n", + " \n", + " for h in range(n_H): # loop on the vertical axis\n", + " for w in range(n_W): # loop on the horizontal axis\n", + " for c in range(n_C): # loop over the channels (depth)\n", + " \n", + " # Find the corners of the current \"slice\" (≈4 lines)\n", + " vert_start = h*stride\n", + " vert_end = vert_start+f\n", + " horiz_start = w*stride\n", + " horiz_end = horiz_start+f\n", + " \n", + " # Compute the backward propagation in both modes.\n", + " if mode == \"max\":\n", + " \n", + " # Use the corners and \"c\" to define the current slice from a_prev (≈1 line)\n", + " a_prev_slice = a_prev[vert_start:vert_end, horiz_start:horiz_end, c ]\n", + " # Create the mask from a_prev_slice (≈1 line)\n", + " mask = create_mask_from_window(a_prev_slice)\n", + " # Set dA_prev to be dA_prev + (the mask multiplied by the correct entry of dA) (≈1 line)\n", + " dA_prev[i, vert_start: vert_end, horiz_start: horiz_end, c] += dA[i, h, w, c]*mask\n", + " \n", + " elif mode == \"average\":\n", + " \n", + " # Get the value a from dA (≈1 line)\n", + " da = dA[i,h,w, c]\n", + " # Define the shape of the filter as fxf (≈1 line)\n", + " shape = (f,f)\n", + " # Distribute it to get the correct slice of dA_prev. i.e. Add the distributed value of da. (≈1 line)\n", + " dA_prev[i, vert_start: vert_end, horiz_start: horiz_end, c] += distribute_value(da, shape)\n", + " \n", + " ### END CODE ###\n", + " \n", + " # Making sure your output shape is correct\n", + " assert(dA_prev.shape == A_prev.shape)\n", + " \n", + " return dA_prev" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mode = max\n", + "mean of dA = 0.14571390272918056\n", + "dA_prev[1,1] = [[ 0. 0. ]\n", + " [ 5.05844394 -1.68282702]\n", + " [ 0. 0. ]]\n", + "\n", + "mode = average\n", + "mean of dA = 0.14571390272918056\n", + "dA_prev[1,1] = [[ 0.08485462 0.2787552 ]\n", + " [ 1.26461098 -0.25749373]\n", + " [ 1.17975636 -0.53624893]]\n" + ] + } + ], + "source": [ + "np.random.seed(1)\n", + "A_prev = np.random.randn(5, 5, 3, 2)\n", + "hparameters = {\"stride\" : 1, \"f\": 2}\n", + "A, cache = pool_forward(A_prev, hparameters)\n", + "dA = np.random.randn(5, 4, 2, 2)\n", + "\n", + "dA_prev = pool_backward(dA, cache, mode = \"max\")\n", + "print(\"mode = max\")\n", + "print('mean of dA = ', np.mean(dA))\n", + "print('dA_prev[1,1] = ', dA_prev[1,1]) \n", + "print()\n", + "dA_prev = pool_backward(dA, cache, mode = \"average\")\n", + "print(\"mode = average\")\n", + "print('mean of dA = ', np.mean(dA))\n", + "print('dA_prev[1,1] = ', dA_prev[1,1]) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**: \n", + "\n", + "mode = max:\n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + "\n", + "**mean of dA =**\n", + "\n", + "\n", + "0.145713902729\n", + "\n", + "
\n", + "**dA_prev[1,1] =** \n", + "\n", + "[[ 0. 0. ]
\n", + " [ 5.05844394 -1.68282702]
\n", + " [ 0. 0. ]]\n", + "
\n", + "\n", + "mode = average\n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + "\n", + "**mean of dA =**\n", + "\n", + "\n", + "0.145713902729\n", + "\n", + "
\n", + "**dA_prev[1,1] =** \n", + "\n", + "[[ 0.08485462 0.2787552 ]
\n", + " [ 1.26461098 -0.25749373]
\n", + " [ 1.17975636 -0.53624893]]\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Congratulations !\n", + "\n", + "Congratulation on completing this assignment. You now understand how convolutional neural networks work. You have implemented all the building blocks of a neural network. In the next assignment you will implement a ConvNet using TensorFlow." + ] + } + ], + "metadata": { + "coursera": { + "course_slug": "convolutional-neural-networks", + "graded_item_id": "qO8ng", + "launcher_item_id": "7XDi8" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/cnn/cnn_report.pdf b/cnn/cnn_report.pdf new file mode 100644 index 0000000..082457b Binary files /dev/null and b/cnn/cnn_report.pdf differ diff --git a/cnn/cnn_utils.py b/cnn/cnn_utils.py new file mode 100644 index 0000000..4bdf418 --- /dev/null +++ b/cnn/cnn_utils.py @@ -0,0 +1,157 @@ +import math +import numpy as np +import h5py +import matplotlib.pyplot as plt +import tensorflow as tf +from tensorflow.python.framework import ops + +def load_dataset(): + """Load the SIGNS train/test arrays from datasets/*.h5; label vectors are returned with shape (1, m).""" + with h5py.File('datasets/train_signs.h5', "r") as train_dataset: # closed on exit; the arrays are copied out first + train_set_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features + train_set_y_orig = np.array(train_dataset["train_set_y"][:]) # your train set labels + + with h5py.File('datasets/test_signs.h5', "r") as test_dataset: # also holds the class list read below + test_set_x_orig = np.array(test_dataset["test_set_x"][:]) # your test set features + test_set_y_orig = np.array(test_dataset["test_set_y"][:]) # your test set labels + + classes = np.array(test_dataset["list_classes"][:]) # the list of classes (read while the test file is still open) + + train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0])) + test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0])) + + return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes + + +def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0): + """ + Creates a list of random minibatches from (X, Y) + + Arguments: + X -- input data, of shape (m, Hi, Wi, Ci) + Y -- true "label" matrix, of shape (m, n_y) + mini_batch_size - size of the mini-batches, integer + seed -- seed passed to np.random.seed, so that the "random" minibatches are reproducible + + Returns: + mini_batches -- list of synchronous (mini_batch_X, mini_batch_Y) + """ + + m = X.shape[0] # number of training examples + mini_batches = [] + np.random.seed(seed) + + # Step 1: Shuffle (X, Y) + permutation = list(np.random.permutation(m)) + shuffled_X = X[permutation,:,:,:] + shuffled_Y = Y[permutation,:] + + # Step 2: Partition (shuffled_X, shuffled_Y). Minus the end case. 
+ num_complete_minibatches = math.floor(m/mini_batch_size) # number of mini batches of size mini_batch_size in your partitioning + for k in range(0, num_complete_minibatches): + mini_batch_X = shuffled_X[k * mini_batch_size : k * mini_batch_size + mini_batch_size,:,:,:] + mini_batch_Y = shuffled_Y[k * mini_batch_size : k * mini_batch_size + mini_batch_size,:] + mini_batch = (mini_batch_X, mini_batch_Y) + mini_batches.append(mini_batch) + + # Handling the end case (last mini-batch < mini_batch_size) + if m % mini_batch_size != 0: + mini_batch_X = shuffled_X[num_complete_minibatches * mini_batch_size : m,:,:,:] + mini_batch_Y = shuffled_Y[num_complete_minibatches * mini_batch_size : m,:] + mini_batch = (mini_batch_X, mini_batch_Y) + mini_batches.append(mini_batch) + + return mini_batches + + +def convert_to_one_hot(Y, C): + """Return the (C, m) one-hot encoding of the m integer labels in Y (Y is flattened first).""" + Y = np.eye(C)[Y.reshape(-1)].T + return Y + + +def forward_propagation_for_predict(X, parameters): + """ + Implements the forward propagation for the model: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SOFTMAX + + Arguments: + X -- input dataset placeholder, of shape (input size, number of examples) + parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3" + the shapes are given in initialize_parameters + + Returns: + Z3 -- the output of the last LINEAR unit + """ + + # Retrieve the parameters from the dictionary "parameters" + W1 = parameters['W1'] + b1 = parameters['b1'] + W2 = parameters['W2'] + b2 = parameters['b2'] + W3 = parameters['W3'] + b3 = parameters['b3'] + # Numpy Equivalents: + Z1 = tf.add(tf.matmul(W1, X), b1) # Z1 = np.dot(W1, X) + b1 + A1 = tf.nn.relu(Z1) # A1 = relu(Z1) + Z2 = tf.add(tf.matmul(W2, A1), b2) # Z2 = np.dot(W2, A1) + b2 + A2 = tf.nn.relu(Z2) # A2 = relu(Z2) + Z3 = tf.add(tf.matmul(W3, A2), b3) # Z3 = np.dot(W3, A2) + b3 + + return Z3 + +def predict(X, parameters): + """Feed X through forward_propagation_for_predict via a (12288, 1) placeholder and return the argmax of the resulting Z3.""" + W1 = tf.convert_to_tensor(parameters["W1"]) + b1 = tf.convert_to_tensor(parameters["b1"]) + W2 = tf.convert_to_tensor(parameters["W2"]) + b2 = tf.convert_to_tensor(parameters["b2"]) + W3 = tf.convert_to_tensor(parameters["W3"]) + b3 = tf.convert_to_tensor(parameters["b3"]) + + params = {"W1": W1, + "b1": b1, + "W2": W2, + "b2": b2, + "W3": W3, + "b3": b3} + + x = tf.placeholder("float", [12288, 1]) + + z3 = forward_propagation_for_predict(x, params) + p = tf.argmax(z3) + + with tf.Session() as sess: # context manager releases the session's resources after the run + prediction = sess.run(p, feed_dict = {x: X}) + + return prediction + +#def predict(X, parameters): +# +# W1 = tf.convert_to_tensor(parameters["W1"]) +# b1 = tf.convert_to_tensor(parameters["b1"]) +# W2 = tf.convert_to_tensor(parameters["W2"]) +# b2 = tf.convert_to_tensor(parameters["b2"]) +## W3 = tf.convert_to_tensor(parameters["W3"]) +## b3 = tf.convert_to_tensor(parameters["b3"]) +# +## params = {"W1": W1, +## "b1": b1, +## "W2": W2, +## "b2": b2, +## "W3": W3, +## "b3": b3} +# +# params = {"W1": W1, +# "b1": b1, +# "W2": W2, +# "b2": b2} +# +# x = tf.placeholder("float", [12288, 1]) +# +# z3 = forward_propagation(x, params) +# p = tf.argmax(z3) +# +# with tf.Session() as sess: +# prediction = sess.run(p, feed_dict = {x: X}) +# +# return prediction \ No newline at end of file diff --git a/cnn/datasets/test_signs.h5 b/cnn/datasets/test_signs.h5 new file mode 100644 index 0000000..ac34131 Binary files /dev/null and b/cnn/datasets/test_signs.h5 differ diff --git a/cnn/datasets/train_signs.h5 b/cnn/datasets/train_signs.h5 new file mode 100644 index 0000000..15904fb Binary files /dev/null and b/cnn/datasets/train_signs.h5 differ diff --git a/cnn/images/Convolution_schematic.gif b/cnn/images/Convolution_schematic.gif new file mode 100644 index 0000000..d8c73dc Binary files /dev/null and b/cnn/images/Convolution_schematic.gif differ diff --git a/cnn/images/PAD.png b/cnn/images/PAD.png new file mode 100644 index 0000000..883ad2d Binary files /dev/null and b/cnn/images/PAD.png differ diff --git a/cnn/images/SIGNS.png b/cnn/images/SIGNS.png new file mode 100644 index 
0000000..abba7c4 Binary files /dev/null and b/cnn/images/SIGNS.png differ diff --git a/cnn/images/a_pool.png b/cnn/images/a_pool.png new file mode 100644 index 0000000..93c2f33 Binary files /dev/null and b/cnn/images/a_pool.png differ diff --git a/cnn/images/ave-pool.png b/cnn/images/ave-pool.png new file mode 100644 index 0000000..eb2704e Binary files /dev/null and b/cnn/images/ave-pool.png differ diff --git a/cnn/images/ave_pool1.png b/cnn/images/ave_pool1.png new file mode 100644 index 0000000..5e36230 Binary files /dev/null and b/cnn/images/ave_pool1.png differ diff --git a/cnn/images/average_pool.png b/cnn/images/average_pool.png new file mode 100644 index 0000000..d60c6aa Binary files /dev/null and b/cnn/images/average_pool.png differ diff --git a/cnn/images/conv.png b/cnn/images/conv.png new file mode 100644 index 0000000..0ae8eef Binary files /dev/null and b/cnn/images/conv.png differ diff --git a/cnn/images/conv1.png b/cnn/images/conv1.png new file mode 100644 index 0000000..fd0e1a8 Binary files /dev/null and b/cnn/images/conv1.png differ diff --git a/cnn/images/conv_kiank.mp4 b/cnn/images/conv_kiank.mp4 new file mode 100644 index 0000000..c550f75 Binary files /dev/null and b/cnn/images/conv_kiank.mp4 differ diff --git a/cnn/images/conv_nn.png b/cnn/images/conv_nn.png new file mode 100644 index 0000000..e90a9de Binary files /dev/null and b/cnn/images/conv_nn.png differ diff --git a/cnn/images/max_pool.png b/cnn/images/max_pool.png new file mode 100644 index 0000000..17c82ba Binary files /dev/null and b/cnn/images/max_pool.png differ diff --git a/cnn/images/max_pool1.png b/cnn/images/max_pool1.png new file mode 100644 index 0000000..36acdb5 Binary files /dev/null and b/cnn/images/max_pool1.png differ diff --git a/cnn/images/model.png b/cnn/images/model.png new file mode 100644 index 0000000..84dc475 Binary files /dev/null and b/cnn/images/model.png differ diff --git a/cnn/images/thumbs_up.jpg b/cnn/images/thumbs_up.jpg new file mode 100644 index 
0000000..64ab7dc Binary files /dev/null and b/cnn/images/thumbs_up.jpg differ diff --git a/cnn/images/vert_horiz_kiank.png b/cnn/images/vert_horiz_kiank.png new file mode 100644 index 0000000..15e28bb Binary files /dev/null and b/cnn/images/vert_horiz_kiank.png differ diff --git a/cnn/my_signs_tf.ipynb b/cnn/my_signs_tf.ipynb new file mode 100644 index 0000000..02a4994 --- /dev/null +++ b/cnn/my_signs_tf.ipynb @@ -0,0 +1,271 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import tensorflow as tf\n", + "import h5py\n", + "import matplotlib.pyplot as plt\n", + "from tensorflow.python.framework import ops" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "train_data=h5py.File('datasets/train_signs.h5', 'r')\n", + "test_data=h5py.File('datasets/test_signs.h5', 'r')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "x_train=np.array(train_data['train_set_x'])/255\n", + "y_train=np.array(train_data['train_set_y'])\n", + "\n", + "x_test=np.array(test_data['test_set_x'])/255\n", + "y_test=np.array(test_data['test_set_y'])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "classes= np.array(train_data['list_classes'])\n", + "M=np.eye(classes.shape[0])\n", + "def one_hot(Y,classes=classes):\n", + " # Build the identity from the `classes` argument (not the global M) so the width always matches it\n", + " eye=np.eye(classes.shape[0])\n", + " Y_hot=eye[Y]\n", + " return Y_hot" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "y_train=one_hot(y_train, classes)\n", + "y_test=one_hot(y_test, classes)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, 
+ "metadata": {}, + "outputs": [], + "source": [ + "def forward_prop(X,parameters):\n", + " W1=parameters['W1']\n", + " W2=parameters['W2']\n", + " Z1=tf.nn.conv2d(X, filter=W1, strides=[1,1,1,1], padding='SAME')\n", + " A1=tf.nn.relu(Z1)\n", + " A1=tf.nn.max_pool(A1,ksize=[1,8,8,1],strides=[1,8,8,1], padding='SAME')\n", + " Z2=tf.nn.conv2d(A1, filter=W2, strides=[1,1,1,1], padding='SAME')\n", + " A2=tf.nn.relu(Z2)\n", + " A2=tf.nn.max_pool(A2, ksize=[1,4,4,1], strides=[1,4,4,1], padding='SAME')\n", + "\n", + " F=tf.contrib.layers.flatten(A2)\n", + " Z3=tf.contrib.layers.fully_connected(F,6, activation_fn=None)\n", + " return Z3" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "ops.reset_default_graph()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_cost(Z3,Y):\n", + " return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=Z3, labels=Y)) " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "X=tf.placeholder(dtype=tf.float32, shape=[None,64,64,3])\n", + "Y=tf.placeholder(dtype=tf.float32, shape=[None,6])\n", + "\n", + "tf.set_random_seed(1)\n", + "W1=tf.get_variable('W1', shape=[4,4,3,8], initializer=tf.contrib.layers.xavier_initializer(seed=0))\n", + "W2=tf.get_variable('W2', shape=[2,2,8,16], initializer=tf.contrib.layers.xavier_initializer(seed=0))\n", + "parameters={'W1':W1, 'W2':W2}\n", + "\n", + "Z3=forward_prop(X,parameters)\n", + "cost=compute_cost(Z3,Y)\n", + "optimizer=tf.train.AdamOptimizer(learning_rate=0.009).minimize(cost)\n", + "init=tf.global_variables_initializer()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(\"Mean_3:0\", shape=(), dtype=float32)\n", + "Train Accuracy: 0.4074074\n", + "Test Accuracy: 0.40833333\n" + ] + } + ], + "source": [ + "scores=[]\n", + "with tf.Session() as sess:\n", + " sess.run(init)\n", + " for i in range (50):\n", + " _,score=sess.run([optimizer,cost],{X:x_train, Y:y_train})\n", + " #print(f'Score: {score}, iterration: {i}')\n", + " scores.append(score)\n", + " predict_op = tf.argmax(Z3, 1)\n", + " correct_prediction = tf.equal(predict_op, tf.argmax(Y, 1))\n", + " \n", + " # Accuracy op; evaluated below on both the train and the test set\n", + " accuracy = tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n", + " print(accuracy)\n", + " train_accuracy = accuracy.eval({X: x_train, Y: y_train})\n", + " test_accuracy = accuracy.eval({X: x_test, Y: y_test})\n", + " print(\"Train Accuracy:\", train_accuracy)\n", + " print(\"Test Accuracy:\", test_accuracy)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(scores)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(\"Mean_2:0\", shape=(), dtype=float32)\n" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/face_recognition_with_ORL/Face_identification_problem.docx b/face_recognition_with_ORL/Face_identification_problem.docx new file mode 100644 index 0000000..0305198 Binary files /dev/null and b/face_recognition_with_ORL/Face_identification_problem.docx differ diff --git a/optimization_algorithms/Optimization methods.ipynb b/optimization_algorithms/Optimization methods.ipynb new file mode 100644 index 0000000..13aedae --- /dev/null +++ b/optimization_algorithms/Optimization methods.ipynb @@ -0,0 +1,1383 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Optimization Methods\n", + "\n", + "Until now, you've always used Gradient Descent to update the parameters and minimize the cost. In this notebook, you will learn more advanced optimization methods that can speed up learning and perhaps even get you to a better final value for the cost function. Having a good optimization algorithm can be the difference between waiting days vs. just a few hours to get a good result. 
\n", + "\n", + "Gradient descent goes \"downhill\" on a cost function $J$. Think of it as trying to do this: \n", + "\n", + "
**Figure 1** : **Minimizing the cost is like finding the lowest point in a hilly landscape**
At each step of the training, you update your parameters following a certain direction to try to get to the lowest possible point.
\n", + "\n", + "**Notations**: As usual, $\\frac{\\partial J}{\\partial a } = $ `da` for any variable `a`.\n", + "\n", + "To get started, run the following code to import the libraries you will need." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import scipy.io\n", + "import math\n", + "import sklearn\n", + "import sklearn.datasets\n", + "\n", + "from opt_utils import load_params_and_grads, initialize_parameters, forward_propagation, backward_propagation\n", + "from opt_utils import compute_cost, predict, predict_dec, plot_decision_boundary, load_dataset\n", + "from testCases import *\n", + "\n", + "%matplotlib inline\n", + "plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots\n", + "plt.rcParams['image.interpolation'] = 'nearest'\n", + "plt.rcParams['image.cmap'] = 'gray'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1 - Gradient Descent\n", + "\n", + "A simple optimization method in machine learning is gradient descent (GD). When you take gradient steps with respect to all $m$ examples on each step, it is also called Batch Gradient Descent. \n", + "\n", + "**Warm-up exercise**: Implement the gradient descent update rule. The gradient descent rule is, for $l = 1, ..., L$: \n", + "$$ W^{[l]} = W^{[l]} - \\alpha \\text{ } dW^{[l]} \\tag{1}$$\n", + "$$ b^{[l]} = b^{[l]} - \\alpha \\text{ } db^{[l]} \\tag{2}$$\n", + "\n", + "where L is the number of layers and $\\alpha$ is the learning rate. All parameters should be stored in the `parameters` dictionary. Note that the iterator `l` starts at 0 in the `for` loop while the first parameters are $W^{[1]}$ and $b^{[1]}$. You need to shift `l` to `l+1` when coding." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# GRADED FUNCTION: update_parameters_with_gd\n", + "\n", + "def update_parameters_with_gd(parameters, grads, learning_rate):\n", + " \"\"\"\n", + " Update parameters using one step of gradient descent\n", + " \n", + " Arguments:\n", + " parameters -- python dictionary containing your parameters to be updated:\n", + " parameters['W' + str(l)] = Wl\n", + " parameters['b' + str(l)] = bl\n", + " grads -- python dictionary containing your gradients to update each parameters:\n", + " grads['dW' + str(l)] = dWl\n", + " grads['db' + str(l)] = dbl\n", + " learning_rate -- the learning rate, scalar.\n", + " \n", + " Returns:\n", + " parameters -- python dictionary containing your updated parameters (the input dictionary, updated in place)\n", + " \"\"\"\n", + "\n", + " L = len(parameters) // 2 # number of layers in the neural networks\n", + "\n", + " # Update rule for each parameter: theta = theta - learning_rate * d(theta); keys are 1-based, hence l+1\n", + " for l in range(L):\n", + " ### START CODE HERE ### (approx. 
2 lines)\n", + " parameters[\"W\" + str(l+1)] = parameters[\"W\" + str(l+1)] - learning_rate * grads[\"dW\" + str(l+1)]\n", + " parameters[\"b\" + str(l+1)] = parameters[\"b\" + str(l+1)] - learning_rate * grads[\"db\" + str(l+1)]\n", + " ### END CODE HERE ###\n", + " \n", + " return parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "parameters, grads, learning_rate = update_parameters_with_gd_test_case()\n", + "\n", + "parameters = update_parameters_with_gd(parameters, grads, learning_rate)\n", + "print(\"W1 = \" + str(parameters[\"W1\"]))\n", + "print(\"b1 = \" + str(parameters[\"b1\"]))\n", + "print(\"W2 = \" + str(parameters[\"W2\"]))\n", + "print(\"b2 = \" + str(parameters[\"b2\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**:\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
**W1** [[ 1.63535156 -0.62320365 -0.53718766]\n", + " [-1.07799357 0.85639907 -2.29470142]]
**b1** [[ 1.74604067]\n", + " [-0.75184921]]
**W2** [[ 0.32171798 -0.25467393 1.46902454]\n", + " [-2.05617317 -0.31554548 -0.3756023 ]\n", + " [ 1.1404819 -1.09976462 -0.1612551 ]]
**b2** [[-0.88020257]\n", + " [ 0.02561572]\n", + " [ 0.57539477]]
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A variant of this is Stochastic Gradient Descent (SGD), which is equivalent to mini-batch gradient descent where each mini-batch has just 1 example. The update rule that you have just implemented does not change. What changes is that you would be computing gradients on just one training example at a time, rather than on the whole training set. The code examples below illustrate the difference between stochastic gradient descent and (batch) gradient descent. \n", + "\n", + "- **(Batch) Gradient Descent**:\n", + "\n", + "``` python\n", + "X = data_input\n", + "Y = labels\n", + "parameters = initialize_parameters(layers_dims)\n", + "for i in range(0, num_iterations):\n", + " # Forward propagation\n", + " a, caches = forward_propagation(X, parameters)\n", + " # Compute cost.\n", + " cost = compute_cost(a, Y)\n", + " # Backward propagation.\n", + " grads = backward_propagation(a, caches, parameters)\n", + " # Update parameters.\n", + " parameters = update_parameters(parameters, grads)\n", + " \n", + "```\n", + "\n", + "- **Stochastic Gradient Descent**:\n", + "\n", + "```python\n", + "X = data_input\n", + "Y = labels\n", + "parameters = initialize_parameters(layers_dims)\n", + "for i in range(0, num_iterations):\n", + " for j in range(0, m):\n", + " # Forward propagation\n", + " a, caches = forward_propagation(X[:,j], parameters)\n", + " # Compute cost\n", + " cost = compute_cost(a, Y[:,j])\n", + " # Backward propagation\n", + " grads = backward_propagation(a, caches, parameters)\n", + " # Update parameters.\n", + " parameters = update_parameters(parameters, grads)\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In Stochastic Gradient Descent, you use only 1 training example before updating the gradients. When the training set is large, SGD can be faster. But the parameters will \"oscillate\" toward the minimum rather than converge smoothly. 
Here is an illustration of this: \n", + "\n", + "\n", + "
**Figure 1** : **SGD vs GD**
\"+\" denotes a minimum of the cost. SGD leads to many oscillations to reach convergence. But each step is a lot faster to compute for SGD than for GD, as it uses only one training example (vs. the whole batch for GD).
\n", + "\n", + "**Note** also that implementing SGD requires 3 for-loops in total:\n", + "1. Over the number of iterations\n", + "2. Over the $m$ training examples\n", + "3. Over the layers (to update all parameters, from $(W^{[1]},b^{[1]})$ to $(W^{[L]},b^{[L]})$)\n", + "\n", + "In practice, you'll often get faster results if you use neither the whole training set nor only one training example to perform each update. Mini-batch gradient descent uses an intermediate number of examples for each step. With mini-batch gradient descent, you loop over the mini-batches instead of looping over individual training examples.\n", + "\n", + "\n", + "
**Figure 2** : **SGD vs Mini-Batch GD**
\"+\" denotes a minimum of the cost. Using mini-batches in your optimization algorithm often leads to faster optimization.
\n", + "\n", + "\n", + "**What you should remember**:\n", + "- The difference between gradient descent, mini-batch gradient descent and stochastic gradient descent is the number of examples you use to perform one update step.\n", + "- You have to tune a learning rate hyperparameter $\\alpha$.\n", + "- With a well-tuned mini-batch size, mini-batch gradient descent usually outperforms either gradient descent or stochastic gradient descent (particularly when the training set is large)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2 - Mini-Batch Gradient descent\n", + "\n", + "Let's learn how to build mini-batches from the training set (X, Y).\n", + "\n", + "There are two steps:\n", + "- **Shuffle**: Create a shuffled version of the training set (X, Y) as shown below. Each column of X and Y represents a training example. Note that the random shuffling is done synchronously between X and Y, so that after the shuffling the $i^{th}$ column of X is the example corresponding to the $i^{th}$ label in Y. The shuffling step ensures that examples will be split randomly into different mini-batches. \n", + "\n", + "\n", + "\n", + "- **Partition**: Partition the shuffled (X, Y) into mini-batches of size `mini_batch_size` (here 64). Note that the number of training examples is not always divisible by `mini_batch_size`. The last mini batch might be smaller, but you don't need to worry about this. When the final mini-batch is smaller than the full `mini_batch_size`, it will look like this: \n", + "\n", + "\n", + "\n", + "**Exercise**: Implement `random_mini_batches`. We coded the shuffling part for you. 
To help you with the partitioning step, we give you the following code that selects the indexes for the $1^{st}$ and $2^{nd}$ mini-batches:\n", + "```python\n", + "first_mini_batch_X = shuffled_X[:, 0 : mini_batch_size]\n", + "second_mini_batch_X = shuffled_X[:, mini_batch_size : 2 * mini_batch_size]\n", + "...\n", + "```\n", + "\n", + "Note that the last mini-batch might end up smaller than `mini_batch_size=64`. Let $\\lfloor s \\rfloor$ represent $s$ rounded down to the nearest integer (this is `math.floor(s)` in Python). If the total number of examples is not a multiple of `mini_batch_size=64` then there will be $\\lfloor \\frac{m}{mini\\_batch\\_size}\\rfloor$ mini-batches with a full 64 examples, and the number of examples in the final mini-batch will be ($m - mini\\_batch\\_size \\times \\lfloor \\frac{m}{mini\\_batch\\_size}\\rfloor$). " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# GRADED FUNCTION: random_mini_batches\n", + "\n", + "def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):\n", + " \"\"\"\n", + " Creates a list of random minibatches from (X, Y)\n", + " \n", + " Arguments:\n", + " X -- input data, of shape (input size, number of examples)\n", + " Y -- true \"label\" vector (1 for blue dot / 0 for red dot), of shape (1, number of examples)\n", + " mini_batch_size -- size of the mini-batches, integer\n", + " \n", + " Returns:\n", + " mini_batches -- list of synchronous (mini_batch_X, mini_batch_Y)\n", + " \"\"\"\n", + " \n", + " np.random.seed(seed) # To make your \"random\" minibatches the same as ours\n", + " m = X.shape[1] # number of training examples\n", + " mini_batches = []\n", + " \n", + " # Step 1: Shuffle (X, Y)\n", + " permutation = list(np.random.permutation(m))\n", + " shuffled_X = X[:, permutation]\n", + " shuffled_Y = Y[:, permutation].reshape((1,m))\n", + "\n", + " # Step 2: Partition (shuffled_X, shuffled_Y). 
Minus the end case.\n", + " num_complete_minibatches = math.floor(m/mini_batch_size) # number of mini batches of size mini_batch_size in your partitionning\n", + " for k in range(0, num_complete_minibatches):\n", + " ### START CODE HERE ### (approx. 2 lines)\n", + " mini_batch_X = None\n", + " mini_batch_Y = None\n", + " ### END CODE HERE ###\n", + " mini_batch = (mini_batch_X, mini_batch_Y)\n", + " mini_batches.append(mini_batch)\n", + " \n", + " # Handling the end case (last mini-batch < mini_batch_size)\n", + " if m % mini_batch_size != 0:\n", + " ### START CODE HERE ### (approx. 2 lines)\n", + " mini_batch_X = None\n", + " mini_batch_Y = None\n", + " ### END CODE HERE ###\n", + " mini_batch = (mini_batch_X, mini_batch_Y)\n", + " mini_batches.append(mini_batch)\n", + " \n", + " return mini_batches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X_assess, Y_assess, mini_batch_size = random_mini_batches_test_case()\n", + "mini_batches = random_mini_batches(X_assess, Y_assess, mini_batch_size)\n", + "\n", + "print (\"shape of the 1st mini_batch_X: \" + str(mini_batches[0][0].shape))\n", + "print (\"shape of the 2nd mini_batch_X: \" + str(mini_batches[1][0].shape))\n", + "print (\"shape of the 3rd mini_batch_X: \" + str(mini_batches[2][0].shape))\n", + "print (\"shape of the 1st mini_batch_Y: \" + str(mini_batches[0][1].shape))\n", + "print (\"shape of the 2nd mini_batch_Y: \" + str(mini_batches[1][1].shape)) \n", + "print (\"shape of the 3rd mini_batch_Y: \" + str(mini_batches[2][1].shape))\n", + "print (\"mini batch sanity check: \" + str(mini_batches[0][0][0][0:3]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**:\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
**shape of the 1st mini_batch_X** (12288, 64)
**shape of the 2nd mini_batch_X** (12288, 64)
**shape of the 3rd mini_batch_X** (12288, 20)
**shape of the 1st mini_batch_Y** (1, 64)
**shape of the 2nd mini_batch_Y** (1, 64)
**shape of the 3rd mini_batch_Y** (1, 20)
**mini batch sanity check** [ 0.90085595 -0.7612069 0.2344157 ]
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "**What you should remember**:\n", + "- Shuffling and Partitioning are the two steps required to build mini-batches\n", + "- Powers of two are often chosen to be the mini-batch size, e.g., 16, 32, 64, 128." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3 - Momentum\n", + "\n", + "Because mini-batch gradient descent makes a parameter update after seeing just a subset of examples, the direction of the update has some variance, and so the path taken by mini-batch gradient descent will \"oscillate\" toward convergence. Using momentum can reduce these oscillations. \n", + "\n", + "Momentum takes into account the past gradients to smooth out the update. We will store the 'direction' of the previous gradients in the variable $v$. Formally, this will be the exponentially weighted average of the gradient on previous steps. You can also think of $v$ as the \"velocity\" of a ball rolling downhill, building up speed (and momentum) according to the direction of the gradient/slope of the hill. \n", + "\n", + "\n", + "
**Figure 3**: The red arrows show the direction taken by one step of mini-batch gradient descent with momentum. The blue points show the direction of the gradient (with respect to the current mini-batch) on each step. Rather than just following the gradient, we let the gradient influence $v$ and then take a step in the direction of $v$.
\n", + "\n", + "\n", + "**Exercise**: Initialize the velocity. The velocity, $v$, is a python dictionary that needs to be initialized with arrays of zeros. Its keys are the same as those in the `grads` dictionary, that is:\n", + "for $l =1,...,L$:\n", + "```python\n", + "v[\"dW\" + str(l+1)] = ... #(numpy array of zeros with the same shape as parameters[\"W\" + str(l+1)])\n", + "v[\"db\" + str(l+1)] = ... #(numpy array of zeros with the same shape as parameters[\"b\" + str(l+1)])\n", + "```\n", + "**Note** that the iterator l starts at 0 in the for loop while the first parameters are v[\"dW1\"] and v[\"db1\"] (that's a \"one\" on the superscript). This is why we are shifting l to l+1 in the `for` loop." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# GRADED FUNCTION: initialize_velocity\n", + "\n", + "def initialize_velocity(parameters):\n", + " \"\"\"\n", + " Initializes the velocity as a python dictionary with:\n", + " - keys: \"dW1\", \"db1\", ..., \"dWL\", \"dbL\" \n", + " - values: numpy arrays of zeros of the same shape as the corresponding gradients/parameters.\n", + " Arguments:\n", + " parameters -- python dictionary containing your parameters.\n", + " parameters['W' + str(l)] = Wl\n", + " parameters['b' + str(l)] = bl\n", + " \n", + " Returns:\n", + " v -- python dictionary containing the current velocity.\n", + " v['dW' + str(l)] = velocity of dWl\n", + " v['db' + str(l)] = velocity of dbl\n", + " \"\"\"\n", + " \n", + " L = len(parameters) // 2 # number of layers in the neural networks\n", + " v = {}\n", + " \n", + " # Initialize velocity\n", + " for l in range(L):\n", + " ### START CODE HERE ### (approx. 
2 lines)\n", + " v[\"dW\" + str(l+1)] = None\n", + " v[\"db\" + str(l+1)] = None\n", + " ### END CODE HERE ###\n", + " \n", + " return v" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "parameters = initialize_velocity_test_case()\n", + "\n", + "v = initialize_velocity(parameters)\n", + "print(\"v[\\\"dW1\\\"] = \" + str(v[\"dW1\"]))\n", + "print(\"v[\\\"db1\\\"] = \" + str(v[\"db1\"]))\n", + "print(\"v[\\\"dW2\\\"] = \" + str(v[\"dW2\"]))\n", + "print(\"v[\\\"db2\\\"] = \" + str(v[\"db2\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**:\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
**v[\"dW1\"]** [[ 0. 0. 0.]\n", + " [ 0. 0. 0.]]
**v[\"db1\"]** [[ 0.]\n", + " [ 0.]]
**v[\"dW2\"]** [[ 0. 0. 0.]\n", + " [ 0. 0. 0.]\n", + " [ 0. 0. 0.]]
**v[\"db2\"]** [[ 0.]\n", + " [ 0.]\n", + " [ 0.]]
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise**: Now, implement the parameters update with momentum. The momentum update rule is, for $l = 1, ..., L$: \n", + "\n", + "$$ \\begin{cases}\n", + "v_{dW^{[l]}} = \\beta v_{dW^{[l]}} + (1 - \\beta) dW^{[l]} \\\\\n", + "W^{[l]} = W^{[l]} - \\alpha v_{dW^{[l]}}\n", + "\\end{cases}\\tag{3}$$\n", + "\n", + "$$\\begin{cases}\n", + "v_{db^{[l]}} = \\beta v_{db^{[l]}} + (1 - \\beta) db^{[l]} \\\\\n", + "b^{[l]} = b^{[l]} - \\alpha v_{db^{[l]}} \n", + "\\end{cases}\\tag{4}$$\n", + "\n", + "where L is the number of layers, $\\beta$ is the momentum and $\\alpha$ is the learning rate. All parameters should be stored in the `parameters` dictionary. Note that the iterator `l` starts at 0 in the `for` loop while the first parameters are $W^{[1]}$ and $b^{[1]}$ (that's a \"one\" on the superscript). So you will need to shift `l` to `l+1` when coding." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# GRADED FUNCTION: update_parameters_with_momentum\n", + "\n", + "def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):\n", + " \"\"\"\n", + " Update parameters using Momentum\n", + " \n", + " Arguments:\n", + " parameters -- python dictionary containing your parameters:\n", + " parameters['W' + str(l)] = Wl\n", + " parameters['b' + str(l)] = bl\n", + " grads -- python dictionary containing your gradients for each parameters:\n", + " grads['dW' + str(l)] = dWl\n", + " grads['db' + str(l)] = dbl\n", + " v -- python dictionary containing the current velocity:\n", + " v['dW' + str(l)] = ...\n", + " v['db' + str(l)] = ...\n", + " beta -- the momentum hyperparameter, scalar\n", + " learning_rate -- the learning rate, scalar\n", + " \n", + " Returns:\n", + " parameters -- python dictionary containing your updated parameters \n", + " v -- python dictionary containing your updated 
velocities\n", + " \"\"\"\n", + "\n", + " L = len(parameters) // 2 # number of layers in the neural networks\n", + " \n", + " # Momentum update for each parameter\n", + " for l in range(L):\n", + " \n", + " ### START CODE HERE ### (approx. 4 lines)\n", + " # compute velocities\n", + " v[\"dW\" + str(l+1)] = None\n", + " v[\"db\" + str(l+1)] = None\n", + " # update parameters\n", + " parameters[\"W\" + str(l+1)] = None\n", + " parameters[\"b\" + str(l+1)] = None\n", + " ### END CODE HERE ###\n", + " \n", + " return parameters, v" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "parameters, grads, v = update_parameters_with_momentum_test_case()\n", + "\n", + "parameters, v = update_parameters_with_momentum(parameters, grads, v, beta = 0.9, learning_rate = 0.01)\n", + "print(\"W1 = \" + str(parameters[\"W1\"]))\n", + "print(\"b1 = \" + str(parameters[\"b1\"]))\n", + "print(\"W2 = \" + str(parameters[\"W2\"]))\n", + "print(\"b2 = \" + str(parameters[\"b2\"]))\n", + "print(\"v[\\\"dW1\\\"] = \" + str(v[\"dW1\"]))\n", + "print(\"v[\\\"db1\\\"] = \" + str(v[\"db1\"]))\n", + "print(\"v[\\\"dW2\\\"] = \" + str(v[\"dW2\"]))\n", + "print(\"v[\\\"db2\\\"] = \" + str(v[\"db2\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**:\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
**W1** [[ 1.62544598 -0.61290114 -0.52907334]\n", + " [-1.07347112 0.86450677 -2.30085497]]
**b1** [[ 1.74493465]\n", + " [-0.76027113]]
**W2** [[ 0.31930698 -0.24990073 1.4627996 ]\n", + " [-2.05974396 -0.32173003 -0.38320915]\n", + " [ 1.13444069 -1.0998786 -0.1713109 ]]
**b2** [[-0.87809283]\n", + " [ 0.04055394]\n", + " [ 0.58207317]]
**v[\"dW1\"]** [[-0.11006192 0.11447237 0.09015907]\n", + " [ 0.05024943 0.09008559 -0.06837279]]
**v[\"db1\"]** [[-0.01228902]\n", + " [-0.09357694]]
**v[\"dW2\"]** [[-0.02678881 0.05303555 -0.06916608]\n", + " [-0.03967535 -0.06871727 -0.08452056]\n", + " [-0.06712461 -0.00126646 -0.11173103]]
**v[\"db2\"]** [[ 0.02344157]\n", + " [ 0.16598022]\n", + " [ 0.07420442]]
\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "**Note** that:\n", + "- The velocity is initialized with zeros. So the algorithm will take a few iterations to \"build up\" velocity and start to take bigger steps.\n", + "- If $\\beta = 0$, then this just becomes standard gradient descent without momentum. \n", + "\n", + "**How do you choose $\\beta$?**\n", + "\n", + "- The larger the momentum $\\beta$ is, the smoother the update, because we take more of the past gradients into account. But if $\\beta$ is too big, it could also smooth out the updates too much. \n", + "- Common values for $\\beta$ range from 0.8 to 0.999. If you don't feel inclined to tune this, $\\beta = 0.9$ is often a reasonable default. \n", + "- Tuning the optimal $\\beta$ for your model might need trying several values to see what works best in terms of reducing the value of the cost function $J$. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "**What you should remember**:\n", + "- Momentum takes past gradients into account to smooth out the steps of gradient descent. It can be applied with batch gradient descent, mini-batch gradient descent or stochastic gradient descent.\n", + "- You have to tune a momentum hyperparameter $\\beta$ and a learning rate $\\alpha$." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4 - Adam\n", + "\n", + "Adam is one of the most effective optimization algorithms for training neural networks. It combines ideas from RMSProp (described in lecture) and Momentum. \n", + "\n", + "**How does Adam work?**\n", + "1. It calculates an exponentially weighted average of past gradients, and stores it in variables $v$ (before bias correction) and $v^{corrected}$ (with bias correction). \n", + "2. 
It calculates an exponentially weighted average of the squares of the past gradients, and stores it in variables $s$ (before bias correction) and $s^{corrected}$ (with bias correction). \n", + "3. It updates parameters in a direction based on combining information from \"1\" and \"2\".\n", + "\n", + "The update rule is, for $l = 1, ..., L$: \n", + "\n", + "$$\\begin{cases}\n", + "v_{dW^{[l]}} = \\beta_1 v_{dW^{[l]}} + (1 - \\beta_1) \\frac{\\partial \\mathcal{J} }{ \\partial W^{[l]} } \\\\\n", + "v^{corrected}_{dW^{[l]}} = \\frac{v_{dW^{[l]}}}{1 - (\\beta_1)^t} \\\\\n", + "s_{dW^{[l]}} = \\beta_2 s_{dW^{[l]}} + (1 - \\beta_2) (\\frac{\\partial \\mathcal{J} }{\\partial W^{[l]} })^2 \\\\\n", + "s^{corrected}_{dW^{[l]}} = \\frac{s_{dW^{[l]}}}{1 - (\\beta_2)^t} \\\\\n", + "W^{[l]} = W^{[l]} - \\alpha \\frac{v^{corrected}_{dW^{[l]}}}{\\sqrt{s^{corrected}_{dW^{[l]}}} + \\varepsilon}\n", + "\\end{cases}$$\n", + "where:\n", + "- t counts the number of Adam steps taken so far \n", + "- L is the number of layers\n", + "- $\\beta_1$ and $\\beta_2$ are hyperparameters that control the two exponentially weighted averages. \n", + "- $\\alpha$ is the learning rate\n", + "- $\\varepsilon$ is a very small number to avoid dividing by zero\n", + "\n", + "As usual, we will store all parameters in the `parameters` dictionary " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise**: Initialize the Adam variables $v, s$ which keep track of the past information.\n", + "\n", + "**Instruction**: The variables $v, s$ are python dictionaries that need to be initialized with arrays of zeros. Their keys are the same as for `grads`, that is:\n", + "for $l = 1, ..., L$:\n", + "```python\n", + "v[\"dW\" + str(l+1)] = ... #(numpy array of zeros with the same shape as parameters[\"W\" + str(l+1)])\n", + "v[\"db\" + str(l+1)] = ... #(numpy array of zeros with the same shape as parameters[\"b\" + str(l+1)])\n", + "s[\"dW\" + str(l+1)] = ... 
#(numpy array of zeros with the same shape as parameters[\"W\" + str(l+1)])\n", + "s[\"db\" + str(l+1)] = ... #(numpy array of zeros with the same shape as parameters[\"b\" + str(l+1)])\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# GRADED FUNCTION: initialize_adam\n", + "\n", + "def initialize_adam(parameters) :\n", + " \"\"\"\n", + " Initializes v and s as two python dictionaries with:\n", + " - keys: \"dW1\", \"db1\", ..., \"dWL\", \"dbL\" \n", + " - values: numpy arrays of zeros of the same shape as the corresponding gradients/parameters.\n", + " \n", + " Arguments:\n", + " parameters -- python dictionary containing your parameters.\n", + " parameters[\"W\" + str(l)] = Wl\n", + " parameters[\"b\" + str(l)] = bl\n", + " \n", + " Returns: \n", + " v -- python dictionary that will contain the exponentially weighted average of the gradient.\n", + " v[\"dW\" + str(l)] = ...\n", + " v[\"db\" + str(l)] = ...\n", + " s -- python dictionary that will contain the exponentially weighted average of the squared gradient.\n", + " s[\"dW\" + str(l)] = ...\n", + " s[\"db\" + str(l)] = ...\n", + "\n", + " \"\"\"\n", + " \n", + " L = len(parameters) // 2 # number of layers in the neural networks\n", + " v = {}\n", + " s = {}\n", + " \n", + " # Initialize v, s. Input: \"parameters\". Outputs: \"v, s\".\n", + " for l in range(L):\n", + " ### START CODE HERE ### (approx. 
4 lines)\n", + " v[\"dW\" + str(l+1)] = None\n", + " v[\"db\" + str(l+1)] = None\n", + " s[\"dW\" + str(l+1)] = None\n", + " s[\"db\" + str(l+1)] = None\n", + " ### END CODE HERE ###\n", + " \n", + " return v, s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "parameters = initialize_adam_test_case()\n", + "\n", + "v, s = initialize_adam(parameters)\n", + "print(\"v[\\\"dW1\\\"] = \" + str(v[\"dW1\"]))\n", + "print(\"v[\\\"db1\\\"] = \" + str(v[\"db1\"]))\n", + "print(\"v[\\\"dW2\\\"] = \" + str(v[\"dW2\"]))\n", + "print(\"v[\\\"db2\\\"] = \" + str(v[\"db2\"]))\n", + "print(\"s[\\\"dW1\\\"] = \" + str(s[\"dW1\"]))\n", + "print(\"s[\\\"db1\\\"] = \" + str(s[\"db1\"]))\n", + "print(\"s[\\\"dW2\\\"] = \" + str(s[\"dW2\"]))\n", + "print(\"s[\\\"db2\\\"] = \" + str(s[\"db2\"]))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**:\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
**v[\"dW1\"]** [[ 0. 0. 0.]\n", + " [ 0. 0. 0.]]
**v[\"db1\"]** [[ 0.]\n", + " [ 0.]]
**v[\"dW2\"]** [[ 0. 0. 0.]\n", + " [ 0. 0. 0.]\n", + " [ 0. 0. 0.]]
**v[\"db2\"]** [[ 0.]\n", + " [ 0.]\n", + " [ 0.]]
**s[\"dW1\"]** [[ 0. 0. 0.]\n", + " [ 0. 0. 0.]]
**s[\"db1\"]** [[ 0.]\n", + " [ 0.]]
**s[\"dW2\"]** [[ 0. 0. 0.]\n", + " [ 0. 0. 0.]\n", + " [ 0. 0. 0.]]
**s[\"db2\"]** [[ 0.]\n", + " [ 0.]\n", + " [ 0.]]
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise**: Now, implement the parameters update with Adam. Recall the general update rule is, for $l = 1, ..., L$: \n", + "\n", + "$$\\begin{cases}\n", + "v_{W^{[l]}} = \\beta_1 v_{W^{[l]}} + (1 - \\beta_1) \\frac{\\partial J }{ \\partial W^{[l]} } \\\\\n", + "v^{corrected}_{W^{[l]}} = \\frac{v_{W^{[l]}}}{1 - (\\beta_1)^t} \\\\\n", + "s_{W^{[l]}} = \\beta_2 s_{W^{[l]}} + (1 - \\beta_2) (\\frac{\\partial J }{\\partial W^{[l]} })^2 \\\\\n", + "s^{corrected}_{W^{[l]}} = \\frac{s_{W^{[l]}}}{1 - (\\beta_2)^t} \\\\\n", + "W^{[l]} = W^{[l]} - \\alpha \\frac{v^{corrected}_{W^{[l]}}}{\\sqrt{s^{corrected}_{W^{[l]}}}+\\varepsilon}\n", + "\\end{cases}$$\n", + "\n", + "\n", + "**Note** that the iterator `l` starts at 0 in the `for` loop while the first parameters are $W^{[1]}$ and $b^{[1]}$. You need to shift `l` to `l+1` when coding." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# GRADED FUNCTION: update_parameters_with_adam\n", + "\n", + "def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate = 0.01,\n", + " beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8):\n", + " \"\"\"\n", + " Update parameters using Adam\n", + " \n", + " Arguments:\n", + " parameters -- python dictionary containing your parameters:\n", + " parameters['W' + str(l)] = Wl\n", + " parameters['b' + str(l)] = bl\n", + " grads -- python dictionary containing your gradients for each parameters:\n", + " grads['dW' + str(l)] = dWl\n", + " grads['db' + str(l)] = dbl\n", + " v -- Adam variable, moving average of the first gradient, python dictionary\n", + " s -- Adam variable, moving average of the squared gradient, python dictionary\n", + " learning_rate -- the learning rate, scalar.\n", + " beta1 -- Exponential decay hyperparameter for the first moment estimates \n", + " beta2 -- Exponential decay hyperparameter for the second 
moment estimates \n", + " epsilon -- hyperparameter preventing division by zero in Adam updates\n", + "\n", + " Returns:\n", + " parameters -- python dictionary containing your updated parameters \n", + " v -- Adam variable, moving average of the first gradient, python dictionary\n", + " s -- Adam variable, moving average of the squared gradient, python dictionary\n", + " \"\"\"\n", + " \n", + " L = len(parameters) // 2 # number of layers in the neural networks\n", + " v_corrected = {} # Initializing first moment estimate, python dictionary\n", + " s_corrected = {} # Initializing second moment estimate, python dictionary\n", + " \n", + " # Perform Adam update on all parameters\n", + " for l in range(L):\n", + " # Moving average of the gradients. Inputs: \"v, grads, beta1\". Output: \"v\".\n", + " ### START CODE HERE ### (approx. 2 lines)\n", + " v[\"dW\" + str(l+1)] = None\n", + " v[\"db\" + str(l+1)] = None\n", + " ### END CODE HERE ###\n", + "\n", + " # Compute bias-corrected first moment estimate. Inputs: \"v, beta1, t\". Output: \"v_corrected\".\n", + " ### START CODE HERE ### (approx. 2 lines)\n", + " v_corrected[\"dW\" + str(l+1)] = None\n", + " v_corrected[\"db\" + str(l+1)] = None\n", + " ### END CODE HERE ###\n", + "\n", + " # Moving average of the squared gradients. Inputs: \"s, grads, beta2\". Output: \"s\".\n", + " ### START CODE HERE ### (approx. 2 lines)\n", + " s[\"dW\" + str(l+1)] = None\n", + " s[\"db\" + str(l+1)] = None\n", + " ### END CODE HERE ###\n", + "\n", + " # Compute bias-corrected second raw moment estimate. Inputs: \"s, beta2, t\". Output: \"s_corrected\".\n", + " ### START CODE HERE ### (approx. 2 lines)\n", + " s_corrected[\"dW\" + str(l+1)] = None\n", + " s_corrected[\"db\" + str(l+1)] = None\n", + " ### END CODE HERE ###\n", + "\n", + " # Update parameters. Inputs: \"parameters, learning_rate, v_corrected, s_corrected, epsilon\". Output: \"parameters\".\n", + " ### START CODE HERE ### (approx. 
2 lines)\n", + " parameters[\"W\" + str(l+1)] = None\n", + " parameters[\"b\" + str(l+1)] = None\n", + " ### END CODE HERE ###\n", + "\n", + " return parameters, v, s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "parameters, grads, v, s = update_parameters_with_adam_test_case()\n", + "parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t = 2)\n", + "\n", + "print(\"W1 = \" + str(parameters[\"W1\"]))\n", + "print(\"b1 = \" + str(parameters[\"b1\"]))\n", + "print(\"W2 = \" + str(parameters[\"W2\"]))\n", + "print(\"b2 = \" + str(parameters[\"b2\"]))\n", + "print(\"v[\\\"dW1\\\"] = \" + str(v[\"dW1\"]))\n", + "print(\"v[\\\"db1\\\"] = \" + str(v[\"db1\"]))\n", + "print(\"v[\\\"dW2\\\"] = \" + str(v[\"dW2\"]))\n", + "print(\"v[\\\"db2\\\"] = \" + str(v[\"db2\"]))\n", + "print(\"s[\\\"dW1\\\"] = \" + str(s[\"dW1\"]))\n", + "print(\"s[\\\"db1\\\"] = \" + str(s[\"db1\"]))\n", + "print(\"s[\\\"dW2\\\"] = \" + str(s[\"dW2\"]))\n", + "print(\"s[\\\"db2\\\"] = \" + str(s[\"db2\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output**:\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
**W1** [[ 1.63178673 -0.61919778 -0.53561312]\n", + " [-1.08040999 0.85796626 -2.29409733]]
**b1** [[ 1.75225313]\n", + " [-0.75376553]]
**W2** [[ 0.32648046 -0.25681174 1.46954931]\n", + " [-2.05269934 -0.31497584 -0.37661299]\n", + " [ 1.14121081 -1.09245036 -0.16498684]]
**b2** [[-0.88529978]\n", + " [ 0.03477238]\n", + " [ 0.57537385]]
**v[\"dW1\"]** [[-0.11006192 0.11447237 0.09015907]\n", + " [ 0.05024943 0.09008559 -0.06837279]]
**v[\"db1\"]** [[-0.01228902]\n", + " [-0.09357694]]
**v[\"dW2\"]** [[-0.02678881 0.05303555 -0.06916608]\n", + " [-0.03967535 -0.06871727 -0.08452056]\n", + " [-0.06712461 -0.00126646 -0.11173103]]
**v[\"db2\"]** [[ 0.02344157]\n", + " [ 0.16598022]\n", + " [ 0.07420442]]
**s[\"dW1\"]** [[ 0.00121136 0.00131039 0.00081287]\n", + " [ 0.0002525 0.00081154 0.00046748]]
**s[\"db1\"]** [[ 1.51020075e-05]\n", + " [ 8.75664434e-04]]
**s[\"dW2\"]** [[ 7.17640232e-05 2.81276921e-04 4.78394595e-04]\n", + " [ 1.57413361e-04 4.72206320e-04 7.14372576e-04]\n", + " [ 4.50571368e-04 1.60392066e-07 1.24838242e-03]]
**s[\"db2\"]** [[ 5.49507194e-05]\n", + " [ 2.75494327e-03]\n", + " [ 5.50629536e-04]]
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You now have three working optimization algorithms (mini-batch gradient descent, Momentum, Adam). Let's implement a model with each of these optimizers and observe the difference." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5 - Model with different optimization algorithms\n", + "\n", + "Lets use the following \"moons\" dataset to test the different optimization methods. (The dataset is named \"moons\" because the data from each of the two classes looks a bit like a crescent-shaped moon.) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "train_X, train_Y = load_dataset()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have already implemented a 3-layer neural network. You will train it with: \n", + "- Mini-batch **Gradient Descent**: it will call your function:\n", + " - `update_parameters_with_gd()`\n", + "- Mini-batch **Momentum**: it will call your functions:\n", + " - `initialize_velocity()` and `update_parameters_with_momentum()`\n", + "- Mini-batch **Adam**: it will call your functions:\n", + " - `initialize_adam()` and `update_parameters_with_adam()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def model(X, Y, layers_dims, optimizer, learning_rate = 0.0007, mini_batch_size = 64, beta = 0.9,\n", + " beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8, num_epochs = 10000, print_cost = True):\n", + " \"\"\"\n", + " 3-layer neural network model which can be run in different optimizer modes.\n", + " \n", + " Arguments:\n", + " X -- input data, of shape (2, number of examples)\n", + " Y -- true \"label\" vector (1 for blue dot / 0 for red dot), of shape (1, number of examples)\n", + " layers_dims -- python list, containing the size of each layer\n", + 
" learning_rate -- the learning rate, scalar.\n", + " mini_batch_size -- the size of a mini batch\n", + " beta -- Momentum hyperparameter\n", + " beta1 -- Exponential decay hyperparameter for the past gradients estimates \n", + " beta2 -- Exponential decay hyperparameter for the past squared gradients estimates \n", + " epsilon -- hyperparameter preventing division by zero in Adam updates\n", + " num_epochs -- number of epochs\n", + " print_cost -- True to print the cost every 1000 epochs\n", + "\n", + " Returns:\n", + " parameters -- python dictionary containing your updated parameters \n", + " \"\"\"\n", + "\n", + " L = len(layers_dims) # number of layers in the neural networks\n", + " costs = [] # to keep track of the cost\n", + " t = 0 # initializing the counter required for Adam update\n", + " seed = 10 # For grading purposes, so that your \"random\" minibatches are the same as ours\n", + " \n", + " # Initialize parameters\n", + " parameters = initialize_parameters(layers_dims)\n", + "\n", + " # Initialize the optimizer\n", + " if optimizer == \"gd\":\n", + " pass # no initialization required for gradient descent\n", + " elif optimizer == \"momentum\":\n", + " v = initialize_velocity(parameters)\n", + " elif optimizer == \"adam\":\n", + " v, s = initialize_adam(parameters)\n", + " \n", + " # Optimization loop\n", + " for i in range(num_epochs):\n", + " \n", + " # Define the random minibatches. 
We increment the seed to reshuffle differently the dataset after each epoch\n", + " seed = seed + 1\n", + " minibatches = random_mini_batches(X, Y, mini_batch_size, seed)\n", + "\n", + " for minibatch in minibatches:\n", + "\n", + " # Select a minibatch\n", + " (minibatch_X, minibatch_Y) = minibatch\n", + "\n", + " # Forward propagation\n", + " a3, caches = forward_propagation(minibatch_X, parameters)\n", + "\n", + " # Compute cost\n", + " cost = compute_cost(a3, minibatch_Y)\n", + "\n", + " # Backward propagation\n", + " grads = backward_propagation(minibatch_X, minibatch_Y, caches)\n", + "\n", + " # Update parameters\n", + " if optimizer == \"gd\":\n", + " parameters = update_parameters_with_gd(parameters, grads, learning_rate)\n", + " elif optimizer == \"momentum\":\n", + " parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)\n", + " elif optimizer == \"adam\":\n", + " t = t + 1 # Adam counter\n", + " parameters, v, s = update_parameters_with_adam(parameters, grads, v, s,\n", + " t, learning_rate, beta1, beta2, epsilon)\n", + " \n", + " # Print the cost every 1000 epoch\n", + " if print_cost and i % 1000 == 0:\n", + " print (\"Cost after epoch %i: %f\" %(i, cost))\n", + " if print_cost and i % 100 == 0:\n", + " costs.append(cost)\n", + " \n", + " # plot the cost\n", + " plt.plot(costs)\n", + " plt.ylabel('cost')\n", + " plt.xlabel('epochs (per 100)')\n", + " plt.title(\"Learning rate = \" + str(learning_rate))\n", + " plt.show()\n", + "\n", + " return parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You will now run this 3 layer neural network with each of the 3 optimization methods.\n", + "\n", + "### 5.1 - Mini-batch Gradient descent\n", + "\n", + "Run the following code to see how the model does with mini-batch gradient descent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "# train 3-layer model\n", + "layers_dims = [train_X.shape[0], 5, 2, 1]\n", + "parameters = model(train_X, train_Y, layers_dims, optimizer = \"gd\")\n", + "\n", + "# Predict\n", + "predictions = predict(train_X, train_Y, parameters)\n", + "\n", + "# Plot decision boundary\n", + "plt.title(\"Model with Gradient Descent optimization\")\n", + "axes = plt.gca()\n", + "axes.set_xlim([-1.5,2.5])\n", + "axes.set_ylim([-1,1.5])\n", + "plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### 5.2 - Mini-batch gradient descent with momentum\n", + "\n", + "Run the following code to see how the model does with momentum. Because this example is relatively simple, the gains from using momentum are small; but for more complex problems you might see bigger gains." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# train 3-layer model\n", + "layers_dims = [train_X.shape[0], 5, 2, 1]\n", + "parameters = model(train_X, train_Y, layers_dims, beta = 0.9, optimizer = \"momentum\")\n", + "\n", + "# Predict\n", + "predictions = predict(train_X, train_Y, parameters)\n", + "\n", + "# Plot decision boundary\n", + "plt.title(\"Model with Momentum optimization\")\n", + "axes = plt.gca()\n", + "axes.set_xlim([-1.5,2.5])\n", + "axes.set_ylim([-1,1.5])\n", + "plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### 5.3 - Mini-batch with Adam mode\n", + "\n", + "Run the following code to see how the model does with Adam."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# train 3-layer model\n", + "layers_dims = [train_X.shape[0], 5, 2, 1]\n", + "parameters = model(train_X, train_Y, layers_dims, optimizer = \"adam\")\n", + "\n", + "# Predict\n", + "predictions = predict(train_X, train_Y, parameters)\n", + "\n", + "# Plot decision boundary\n", + "plt.title(\"Model with Adam optimization\")\n", + "axes = plt.gca()\n", + "axes.set_xlim([-1.5,2.5])\n", + "axes.set_ylim([-1,1.5])\n", + "plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### 5.4 - Summary\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " **optimization method**\n", + " \n", + " **accuracy**\n", + " \n", + " **cost shape**\n", + "
\n", + " Gradient descent\n", + " \n", + " 79.7%\n", + " \n", + " oscillations\n", + "
\n", + " Momentum\n", + " \n", + " 79.7%\n", + " \n", + " oscillations\n", + "
\n", + " Adam\n", + " \n", + " 94%\n", + " \n", + " smoother\n", + "
\n", + "\n", + "Momentum usually helps, but given the small learning rate and the simplistic dataset, its impact is almost negligible. Also, the huge oscillations you see in the cost come from the fact that some minibatches are more difficult than others for the optimization algorithm.\n", + "\n", + "Adam, on the other hand, clearly outperforms mini-batch gradient descent and Momentum. If you run the model for more epochs on this simple dataset, all three methods will lead to very good results. However, you've seen that Adam converges a lot faster.\n", + "\n", + "Some advantages of Adam include:\n", + "- Relatively low memory requirements (though higher than gradient descent and gradient descent with momentum) \n", + "- Usually works well even with little tuning of hyperparameters (except $\\alpha$)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "**References**:\n", + "\n", + "- Adam paper: https://arxiv.org/pdf/1412.6980.pdf" + ] + } + ], + "metadata": { + "coursera": { + "course_slug": "deep-neural-network", + "graded_item_id": "Ckiv2", + "launcher_item_id": "eNLYh" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/optimization_algorithms/datasets/data.mat b/optimization_algorithms/datasets/data.mat new file mode 100644 index 0000000..a0441ac Binary files /dev/null and b/optimization_algorithms/datasets/data.mat differ diff --git a/optimization_algorithms/images/Momentum.png b/optimization_algorithms/images/Momentum.png new file mode 100644 index 0000000..1052fb5 Binary files /dev/null and b/optimization_algorithms/images/Momentum.png differ diff --git
a/optimization_algorithms/images/cost.jpg b/optimization_algorithms/images/cost.jpg new file mode 100644 index 0000000..91adc20 Binary files /dev/null and b/optimization_algorithms/images/cost.jpg differ diff --git a/optimization_algorithms/images/kiank_minibatch.png b/optimization_algorithms/images/kiank_minibatch.png new file mode 100644 index 0000000..382964b Binary files /dev/null and b/optimization_algorithms/images/kiank_minibatch.png differ diff --git a/optimization_algorithms/images/kiank_partition.png b/optimization_algorithms/images/kiank_partition.png new file mode 100644 index 0000000..792089f Binary files /dev/null and b/optimization_algorithms/images/kiank_partition.png differ diff --git a/optimization_algorithms/images/kiank_sgd.png b/optimization_algorithms/images/kiank_sgd.png new file mode 100644 index 0000000..99ab39e Binary files /dev/null and b/optimization_algorithms/images/kiank_sgd.png differ diff --git a/optimization_algorithms/images/kiank_shuffle.png b/optimization_algorithms/images/kiank_shuffle.png new file mode 100644 index 0000000..f01d117 Binary files /dev/null and b/optimization_algorithms/images/kiank_shuffle.png differ diff --git a/optimization_algorithms/images/opt1.gif b/optimization_algorithms/images/opt1.gif new file mode 100644 index 0000000..61db246 Binary files /dev/null and b/optimization_algorithms/images/opt1.gif differ diff --git a/optimization_algorithms/images/opt2.gif b/optimization_algorithms/images/opt2.gif new file mode 100644 index 0000000..e9d54d0 Binary files /dev/null and b/optimization_algorithms/images/opt2.gif differ diff --git a/optimization_algorithms/images/opt_momentum.png b/optimization_algorithms/images/opt_momentum.png new file mode 100644 index 0000000..dae5c87 Binary files /dev/null and b/optimization_algorithms/images/opt_momentum.png differ diff --git a/optimization_algorithms/opt_utils.py b/optimization_algorithms/opt_utils.py new file mode 100644 index 0000000..8440066 --- /dev/null +++ 
# ---------------------------------------------------------------------------
# optimization_algorithms/opt_utils.py
# Helper functions for a 3-layer (ReLU -> ReLU -> sigmoid) network used by the
# optimization-methods notebook.
# ---------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy.io
import sklearn
import sklearn.datasets


def sigmoid(x):
    """
    Compute the sigmoid of x

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(x), i.e. 1 / (1 + exp(-x)), computed element-wise
    """
    s = 1 / (1 + np.exp(-x))
    return s


def relu(x):
    """
    Compute the relu of x

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- relu(x), i.e. max(0, x), computed element-wise
    """
    s = np.maximum(0, x)
    return s


def load_params_and_grads(seed=1):
    """
    Draw a fixed set of random parameters and gradients for a two-layer toy network.

    Arguments:
    seed -- value passed to np.random.seed before drawing, so the output is reproducible

    Returns:
    W1, b1, W2, b2, dW1, db1, dW2, db2 -- standard-normal arrays of shapes
        (2,3), (2,1), (3,3), (3,1) for the parameters and the same shapes for the gradients
    """
    np.random.seed(seed)
    # The draw order below determines the values; do not reorder.
    W1 = np.random.randn(2, 3)
    b1 = np.random.randn(2, 1)
    W2 = np.random.randn(3, 3)
    b2 = np.random.randn(3, 1)

    dW1 = np.random.randn(2, 3)
    db1 = np.random.randn(2, 1)
    dW2 = np.random.randn(3, 3)
    db2 = np.random.randn(3, 1)

    return W1, b1, W2, b2, dW1, db1, dW2, db2


def initialize_parameters(layer_dims):
    """
    Build the initial weights and biases for a fully connected network.

    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network,
                  input layer first

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1]), drawn from a
                          standard normal and scaled by sqrt(2 / layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1), all zeros

    Note:
    The global numpy seed is reset to 3 on every call, so repeated calls with the same
    layer_dims return identical values.
    """
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)  # number of layers in the network (including the input layer)

    for l in range(1, L):
        fan_in, fan_out = layer_dims[l - 1], layer_dims[l]
        parameters['W' + str(l)] = np.random.randn(fan_out, fan_in) * np.sqrt(2 / fan_in)
        parameters['b' + str(l)] = np.zeros((fan_out, 1))

        # Compare against real tuples. The previous `assert(shape == a, b)` form asserted a
        # two-element tuple, which is always truthy, and never looked at the bias at all.
        assert parameters['W' + str(l)].shape == (fan_out, fan_in)
        assert parameters['b' + str(l)].shape == (fan_out, 1)

    return parameters


def compute_cost(a3, Y):
    """
    Implement the cost function (binary cross-entropy averaged over examples)

    Arguments:
    a3 -- post-activation, output of forward propagation
    Y -- "true" labels vector, same shape as a3

    Returns:
    cost - sum of the per-entry cross-entropy terms divided by Y.shape[1]
    """
    m = Y.shape[1]

    logprobs = np.multiply(-np.log(a3), Y) + np.multiply(-np.log(1 - a3), 1 - Y)
    cost = 1. / m * np.sum(logprobs)

    return cost


def forward_propagation(X, parameters):
    """
    Implements the forward propagation LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
                  (weight matrices and bias vectors of the three layers)

    Returns:
    a3 -- sigmoid output of the last layer
    cache -- tuple (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) consumed by backward_propagation()
    """
    # retrieve parameters
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    z1 = np.dot(W1, X) + b1
    a1 = relu(z1)
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)
    z3 = np.dot(W3, a2) + b3
    a3 = sigmoid(z3)

    cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)

    return a3, cache


def backward_propagation(X, Y, cache):
    """
    Implement the backward propagation matching forward_propagation() and compute_cost().

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true label vector, combined element-wise with a3 (presumably 0/1 labels of the same shape)
    cache -- cache output from forward_propagation()

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and
                 pre-activation variables ("dz3", "dW3", "db3", "da2", "dz2", "dW2", "db2",
                 "da1", "dz1", "dW1", "db1")
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache

    # The 1/m factor of the averaged cost is applied once here and flows through every gradient.
    dz3 = 1. / m * (a3 - Y)
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims=True)

    da2 = np.dot(W3.T, dz3)
    dz2 = np.multiply(da2, np.int64(a2 > 0))  # ReLU derivative: 1 where the unit was active
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)

    da1 = np.dot(W2.T, dz2)
    dz1 = np.multiply(da1, np.int64(a1 > 0))
    dW1 = np.dot(dz1, X.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                 "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                 "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}

    return gradients


def predict(X, y, parameters):
    """
    Predict 0/1 labels with the 3-layer network and print the accuracy against y.

    Arguments:
    X -- data set of examples you would like to label, shape (input size, number of examples)
    y -- true labels; only row 0 is compared with the predictions
    parameters -- parameters of the trained model

    Returns:
    p -- integer array of shape (1, number of examples) with 1 where a3[0, i] > 0.5, else 0
    """
    # Forward propagation
    a3, caches = forward_propagation(X, parameters)

    # convert probas to 0/1 predictions (row 0 only, as before).
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype it aliased.
    p = (a3[:1, :] > 0.5).astype(int)

    # print results
    print("Accuracy: " + str(np.mean((p[0, :] == y[0, :]))))

    return p


def load_2D_dataset():
    """
    Load the 2-D dataset stored in 'datasets/data.mat' (path relative to the working directory)
    and scatter-plot the training points.

    Returns:
    train_X, train_Y, test_X, test_Y -- the 'X', 'y', 'Xval', 'yval' arrays of the .mat file,
                                        each transposed
    """
    data = scipy.io.loadmat('datasets/data.mat')
    train_X = data['X'].T
    train_Y = data['y'].T
    test_X = data['Xval'].T
    test_Y = data['yval'].T

    # Labels are flattened so scatter gets one colour value per point; some matplotlib
    # releases reject a 2-D `c` whose shape differs from x/y.
    plt.scatter(train_X[0, :], train_X[1, :], c=np.ravel(train_Y), s=40, cmap=plt.cm.Spectral)

    return train_X, train_Y, test_X, test_Y


def plot_decision_boundary(model, X, y):
    """
    Plot the decision regions of `model` over a grid together with the examples in X.

    Arguments:
    model -- callable taking an array of shape (number of grid points, 2) and returning one
             value per grid point
    X -- data of shape (2, number of examples); row 0 is plotted on the x axis, row 1 on the y axis
    y -- labels used to colour the examples
    """
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    # Flatten labels for the same reason as in load_2D_dataset().
    plt.scatter(X[0, :], X[1, :], c=np.ravel(y), cmap=plt.cm.Spectral)
    plt.show()


def predict_dec(parameters, X):
    """
    Used for plotting decision boundary.

    Arguments:
    parameters -- python dictionary containing your parameters
    X -- input data in the layout forward_propagation() expects, (input size, number of examples)

    Returns
    predictions -- boolean array, True where the network output exceeds 0.5
    """
    # Predict using forward propagation and a classification threshold of 0.5
    a3, cache = forward_propagation(X, parameters)
    predictions = (a3 > 0.5)
    return predictions


def load_dataset():
    """
    Generate and scatter-plot the two-moons training set (300 samples, noise 0.2, numpy seed 3).

    Returns:
    train_X -- samples, shape (2, 300)
    train_Y -- labels, shape (1, 300)
    """
    np.random.seed(3)
    train_X, train_Y = sklearn.datasets.make_moons(n_samples=300, noise=.2)
    # Visualize the data
    plt.scatter(train_X[:, 0], train_X[:, 1], c=train_Y, s=40, cmap=plt.cm.Spectral)
    train_X = train_X.T
    train_Y = train_Y.reshape((1, train_Y.shape[0]))

    return train_X, train_Y


# ---------------------------------------------------------------------------
# optimization_algorithms/testCases.py
# Deterministic fixtures for the notebook exercises. Every builder reseeds numpy
# with 1 and draws in a fixed order, so the draw order must not change.
# ---------------------------------------------------------------------------

def update_parameters_with_gd_test_case():
    """Return (parameters, grads, learning_rate) for the gradient-descent update exercise."""
    np.random.seed(1)
    learning_rate = 0.01
    W1 = np.random.randn(2, 3)
    b1 = np.random.randn(2, 1)
    W2 = np.random.randn(3, 3)
    b2 = np.random.randn(3, 1)

    dW1 = np.random.randn(2, 3)
    db1 = np.random.randn(2, 1)
    dW2 = np.random.randn(3, 3)
    db2 = np.random.randn(3, 1)

    parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2}
    grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

    return parameters, grads, learning_rate


# Disabled checker kept from the original module (an inert string literal).
"""
def update_parameters_with_sgd_checker(function, inputs, outputs):
    if function(inputs) == outputs:
        print("Correct")
    else:
        print("Incorrect")
"""


def random_mini_batches_test_case():
    """Return (X, Y, mini_batch_size): X is (12288, 148) normal draws, Y a (1, 148) boolean mask."""
    np.random.seed(1)
    mini_batch_size = 64
    X = np.random.randn(12288, 148)
    Y = np.random.randn(1, 148) < 0.5
    return X, Y, mini_batch_size


def initialize_velocity_test_case():
    """Return a parameters dict ("W1", "b1", "W2", "b2") for the velocity-initialization exercise."""
    np.random.seed(1)
    W1 = np.random.randn(2, 3)
    b1 = np.random.randn(2, 1)
    W2 = np.random.randn(3, 3)
    b2 = np.random.randn(3, 1)
    parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2}
    return parameters


def update_parameters_with_momentum_test_case():
    """Return (parameters, grads, v) for the momentum exercise; v holds zero arrays shaped like grads."""
    np.random.seed(1)
    W1 = np.random.randn(2, 3)
    b1 = np.random.randn(2, 1)
    W2 = np.random.randn(3, 3)
    b2 = np.random.randn(3, 1)

    dW1 = np.random.randn(2, 3)
    db1 = np.random.randn(2, 1)
    dW2 = np.random.randn(3, 3)
    db2 = np.random.randn(3, 1)
    parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2}
    grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}
    v = {'dW1': np.zeros((2, 3)), 'dW2': np.zeros((3, 3)),
         'db1': np.zeros((2, 1)), 'db2': np.zeros((3, 1))}
    return parameters, grads, v


def initialize_adam_test_case():
    """Return a parameters dict ("W1", "b1", "W2", "b2") for the Adam-initialization exercise."""
    np.random.seed(1)
    W1 = np.random.randn(2, 3)
    b1 = np.random.randn(2, 1)
    W2 = np.random.randn(3, 3)
    b2 = np.random.randn(3, 1)
    parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2}
    return parameters


def update_parameters_with_adam_test_case():
    """Return (parameters, grads, v, s) for the Adam exercise; v and s hold independent zero arrays."""
    np.random.seed(1)
    v = {'dW1': np.zeros((2, 3)), 'dW2': np.zeros((3, 3)),
         'db1': np.zeros((2, 1)), 'db2': np.zeros((3, 1))}
    s = {'dW1': np.zeros((2, 3)), 'dW2': np.zeros((3, 3)),
         'db1': np.zeros((2, 1)), 'db2': np.zeros((3, 1))}
    W1 = np.random.randn(2, 3)
    b1 = np.random.randn(2, 1)
    W2 = np.random.randn(3, 3)
    b2 = np.random.randn(3, 1)

    dW1 = np.random.randn(2, 3)
    db1 = np.random.randn(2, 1)
    dW2 = np.random.randn(3, 3)
    db2 = np.random.randn(3, 1)

    parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2}
    grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

    return parameters, grads, v, s