From d52400075034e01cabb001c3b6e9e44374da2d20 Mon Sep 17 00:00:00 2001
From: ThomasAtlantis <1138670081@qq.com>
Date: Sat, 28 Dec 2019 01:26:14 +0800
Subject: [PATCH] Fix a major bug: actually wire the convolutional layer into
 the model; best score so far 0.733
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CMakeLists.bak                          |  11 +
 CMakeLists.txt                          |   4 +-
 README.md                               | 168 +++++++++++
 .../t10k-images.idx3-ubyte              | Bin
 .../t10k-labels.idx1-ubyte              | Bin
 .../train-images.idx3-ubyte             | Bin
 .../train-labels.idx1-ubyte             | Bin
 include/dataio.h                        |  71 +++++
 vector.h => include/vector.h            |   0
 main.c => src/main.c                    | 265 +++++++-----------
 test.c                                  |  10 -
 11 files changed, 347 insertions(+), 182 deletions(-)
 create mode 100644 CMakeLists.bak
 create mode 100644 README.md
 rename t10k-images.idx3-ubyte => dataset/t10k-images.idx3-ubyte (100%)
 rename t10k-labels.idx1-ubyte => dataset/t10k-labels.idx1-ubyte (100%)
 rename train-images.idx3-ubyte => dataset/train-images.idx3-ubyte (100%)
 rename train-labels.idx1-ubyte => dataset/train-labels.idx1-ubyte (100%)
 create mode 100644 include/dataio.h
 rename vector.h => include/vector.h (100%)
 rename main.c => src/main.c (56%)
 delete mode 100644 test.c

diff --git a/CMakeLists.bak b/CMakeLists.bak
new file mode 100644
index 0000000..c6ebde5
--- /dev/null
+++ b/CMakeLists.bak
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.10)
+project(C99_MNIST C)
+SET(CMAKE_BUILD_TYPE Release)
+set(CMAKE_C_STANDARD 99)
+set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin)
+set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib)
+include_directories(
+    ${PROJECT_SOURCE_DIR}/include
+)
+add_executable(C99_MNIST src/main.c include/vector.h include/dataio.h)
+target_link_libraries(C99_MNIST m)
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e377433..fb99c16 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,4 @@ project(CNN C)
 
 set(CMAKE_C_STANDARD 99)
 
-add_executable(CNN main.c vector.h)
-
-add_executable(CNN_TEST test.c)
\ No newline at end of file
+add_executable(CNN src/main.c include/vector.h include/dataio.h)
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7306d02
--- /dev/null
+++ b/README.md
@@ -0,0 +1,168 @@
+#### The Original Model
+This project is mainly based on the C++/STL code from the blog post [CNN实现MNIST手写数字识别(C++)](https://blog.csdn.net/qq_37141382/article/details/88088781). The successive improved versions described here live in my [GitHub repository](https://github.com/ThomasAtlantis/C99_MNIST). The model targets the classic handwritten digit recognition task on the tidy MNIST dataset and is meant purely as a proof of concept. While learning CNNs I also built a PyTorch version in Python, available [here](https://github.com/ThomasAtlantis/NER/tree/master/learningResources/Case/HandwrittenDigitRecognition).
+
+The original model structure is very simple: a single convolutional layer followed by a single fully connected layer. A picture says it best:
+[placeholder: structure figure to be added]
+#### Research Goals
+Since the author currently focuses on edge deployment of neural network models, this project is meant to lay the groundwork for implementing CNNs on lower-level edge devices such as FPGAs. Training will still run on a server while inference runs at the edge, so the goal is a project that supports both server-side training and FPGA inference, or at least a general methodology for building one.
+
+The original author wrote this code while learning CNNs to deepen their own understanding; it uses the simplest possible model structure and inevitably has some gaps. This write-up walks through implementing a CNN in a fairly low-level language and includes detailed derivations of the propagation formulas. I hope to study and improve the original code along the following lines (carried out in order):
++ First code downgrade: use the C99 standard and drop the STL
++ Trace the program to understand what each function does, and derive the forward- and back-propagation formulas
++ Debug the existing bugs and check via regression tests whether accuracy improves
++ Improve readability, maintainability, and extensibility
++ Improve runtime efficiency and run precise performance measurements
++ Port the code to a Linux server
++ Change the network to a classic architecture, rework the formulas, reuse the existing functions, and evaluate performance and accuracy
++ Export the parameter matrices in a standard format and write a parser for them
++ Apply a second, both targeted and general, downgrade to the forward pass so that it becomes HLS-friendly code for FPGAs
++ Perform a second round of (hardware) optimization, unit-test compute-heavy blocks such as convolution, and finally wire up the control logic between modules
+
+#### Forward Computation
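+Before diving into the formulas, here is a compact sketch of the forward data flow as I read it from `src/main.c`. The dimensions (a 28x28 single-channel input, five 5x5 kernels, 2x2 max pooling, a 720-to-10 fully connected layer) are inferred from the code; the function names are mine, and the per-kernel bias is a simplification of the per-position bias the current code stores, so treat this as an illustration rather than the exact implementation:
+```c
+#include <math.h>
+
+#define IN 28             /* input image size (28x28, one channel)     */
+#define NF 5              /* number of convolution kernels             */
+#define K  5              /* kernel size                               */
+#define CO (IN - K + 1)   /* conv output size: 24                      */
+#define PO (CO / 2)       /* 2x2 max-pool output size: 12              */
+#define FC (PO * PO * NF) /* flattened fully connected input size: 720 */
+#define OUT 10            /* number of classes                         */
+
+static double relu(double x)    { return x > 0 ? x : 0; }
+static double sigmoid(double x) { return 1.0 / (1.0 + exp(-x)); }
+
+/* One forward pass: valid convolution (stride 1) + ReLU, 2x2 max pooling,
+ * sigmoid-activated flattening, a fully connected layer, then softmax.   */
+void forward(const double img[IN][IN],
+             const double w[NF][K][K], const double b[NF],   /* conv kernels and per-kernel bias */
+             const double fw[OUT][FC], const double fb[OUT], /* fully connected weights and bias */
+             double prob[OUT])
+{
+    double conv[CO][CO][NF], pool[PO][PO][NF], flat[FC];
+
+    for (int n = 0; n < NF; ++n)                   /* convolution + ReLU */
+        for (int i = 0; i < CO; ++i)
+            for (int j = 0; j < CO; ++j) {
+                double s = b[n];
+                for (int a = 0; a < K; ++a)
+                    for (int c = 0; c < K; ++c)
+                        s += img[i + a][j + c] * w[n][a][c];
+                conv[i][j][n] = relu(s);
+            }
+
+    for (int n = 0; n < NF; ++n)                   /* 2x2 max pooling */
+        for (int i = 0; i < PO; ++i)
+            for (int j = 0; j < PO; ++j) {
+                double m = conv[2*i][2*j][n];
+                if (conv[2*i+1][2*j][n]   > m) m = conv[2*i+1][2*j][n];
+                if (conv[2*i][2*j+1][n]   > m) m = conv[2*i][2*j+1][n];
+                if (conv[2*i+1][2*j+1][n] > m) m = conv[2*i+1][2*j+1][n];
+                pool[i][j][n] = m;
+            }
+
+    int x = 0;                                     /* flatten with sigmoid activation */
+    for (int i = 0; i < PO; ++i)
+        for (int j = 0; j < PO; ++j)
+            for (int n = 0; n < NF; ++n)
+                flat[x++] = sigmoid(pool[i][j][n]);
+
+    double maxi = -1e100, sum = 0.0;               /* fully connected layer + softmax */
+    for (int o = 0; o < OUT; ++o) {
+        prob[o] = fb[o];
+        for (int i = 0; i < FC; ++i) prob[o] += fw[o][i] * flat[i];
+        if (prob[o] > maxi) maxi = prob[o];
+    }
+    for (int o = 0; o < OUT; ++o) sum += exp(prob[o] - maxi);
+    for (int o = 0; o < OUT; ++o) prob[o] = exp(prob[o] - maxi) / sum;
+}
+```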
+I will not go through the first downgrade step in detail; anyone with a little coding experience can do it, and the earliest commits in the repository show exactly how. The original blogger states the propagation formulas but gives neither their derivations nor a mapping back to the code. The derivations below draw on several sources, chiefly this [excellent article](https://blog.csdn.net/qq_16137569/article/details/81449209), which explains everything very carefully.
+
+[placeholder: forward-propagation formulas to be added]
+#### Backpropagation
+Backpropagation means propagating the error from the output layer backwards through the network and using it to compute the gradient of the loss with respect to every weight and bias.
+##### Propagating the error sensitivities
+The error sensitivity of a node is the partial derivative of the loss with respect to that node's pre-activation (linear) output, written $\delta$.
+
+**#1 Sensitivity of the fully connected output layer**
+The loss function $C$ of the network is the cross-entropy loss; see [this post](https://blog.csdn.net/chao_shine/article/details/89925762). Let $y_i$ be the raw linear output of node $i$ of the fully connected layer, let $y_i'$ be its value after `softmax`, and let $t_i$ be the `ground truth` probability of class $i$. Since $y_i$ influences every $y_k'$, the chain rule sums over $k$, and the output-layer sensitivity $\delta_{i}$ is computed as follows:
+$$\begin{aligned}
+\because C &=-\Sigma_kt_k\ln y_k',\quad y_i'=\frac{e^{y_i}}{\Sigma_k e^{y_k}} \\
+\therefore \delta_{i} &= \frac{\partial C}{\partial y_i}
+= -\sum_k t_k\frac{1}{y_k'}\frac{\partial y_k'}{\partial y_i} \\
+&= -t_i\frac{1}{y_i'}\frac{e^{y_i}(\Sigma_k e^{y_k})-e^{y_i}e^{y_i}}{(\Sigma_ke^{y_k})^2}
+-\sum_{k\neq i}t_k\frac{1}{y_k'}\frac{-e^{y_k}e^{y_i}}{(\Sigma_ke^{y_k})^2} \\
+&=-t_i(1-y_i')+\sum_{k\neq i}t_ky_i' \\
+&=y_i'\Sigma_kt_k-t_i \\
+&=y_i'-t_i
+\end{aligned}$$
+The last step uses $\Sigma_kt_k=1$: handwritten digit recognition is a single-label multi-class problem, so $t_i$ is 1 when $i$ equals the label and 0 otherwise.
+
+**#2 Sensitivity of the pooling output / fully connected input layer (before activation)**
+Let $z_j$ be the $j$-th input of the fully connected layer (before activation). Then:
+$$\begin{aligned}
+\delta_j&=\frac{\partial C}{\partial z_j}=\sum_{i \in DS(j)}\frac{\partial C}{\partial y_i}\frac{\partial y_i}{\partial z_j} \\
+&=\sum_{i \in DS(j)}\delta_i \frac{\partial(\Sigma_kw_{ik}\sigma(z_k)+b_i)}{\partial z_j} \\
+&=\sum_{i \in DS(j)}\delta_i \frac{d\sigma(z_j)}{dz_j}w_{ij}
+\end{aligned}$$
+Here $DS(j)$ stands for $Downstream(j)$, a term common in descriptions of neural networks: the set of all next-layer nodes connected to node $j$. The activation function here is the `Sigmoid` function $\sigma(x)=1/(1+e^{-x})$. Writing $s(x)=1/\sigma(x)=1+e^{-x}$ for its reciprocal, the derivative of $\sigma$ works out to:
+$$\begin{aligned}
+\because \frac{ds(x)}{dx} &=-e^{-x}=1-s(x) \\
+\therefore \frac{d\sigma(x)}{dx} &= -\frac{s'(x)}{s^2(x)}=\frac{s(x)-1}{s^2(x)} \\
+&= (\frac{1}{\sigma(x)}-1)\sigma^2(x) \\
+&= (1-\sigma(x))\sigma (x)
+\end{aligned}$$
+Let $a_j$ be the activated value of the $j$-th fully connected input node. Substituting the derivative of the activation into the sensitivity formula gives:
+$$\begin{aligned}
+\delta_j = a_j(1-a_j)\sum_{i \in DS(j)}\delta_iw_{ij}
+\end{aligned}$$
+**#3 Sensitivity of the pooling input layer (after ReLU)**
+The pooling layer uses `MaxPooling`, so each output sensitivity is passed back unchanged to the input neuron that held the maximum, while every other input neuron gets a sensitivity of 0 and therefore does not contribute to the parameter updates. Let $\delta_k^{in}$ be the sensitivity of the $k$-th pooling input node $a_k^{in}$, and $\delta_j^{out}$ the sensitivity of the $j$-th pooling output node $z_j^{out}$; for brevity the pooling layer is indexed one-dimensionally here.
+$$ \delta_k^{in}=\left\{
+\begin{aligned}
+\delta_j^{out}&, a_k^{in}=z_j^{out}\\
+0&, otherwise
+\end{aligned}
+\right.
+$$
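+To make this routing concrete, here is a small sketch in the spirit of `pool_delta()` from `src/main.c`; the array sizes, the function name, and the comparison tolerance are illustrative assumptions rather than the exact values used in the code:
+```c
+#include <math.h>
+
+#define PL 24   /* pooling input size (the conv output), assumed here */
+#define PH 5    /* number of channels, assumed here                   */
+
+/* Route each max-pooling output sensitivity back to the input position that
+ * produced the maximum; every other input position gets a sensitivity of 0. */
+void maxpool_backward(const double in[PL][PL][PH],       /* pooling input values         */
+                      const double out[PL/2][PL/2][PH],  /* pooling output values        */
+                      const double dout[PL/2][PL/2][PH], /* output sensitivities         */
+                      double din[PL][PL][PH])            /* input sensitivities (result) */
+{
+    for (int k = 0; k < PH; ++k)
+        for (int i = 0; i < PL; ++i)
+            for (int j = 0; j < PL; ++j)
+                /* the input cell equal to the pooled maximum inherits the delta;
+                 * a small tolerance stands in for exact floating-point equality */
+                din[i][j][k] = (fabs(in[i][j][k] - out[i/2][j/2][k]) < 1e-6)
+                             ? dout[i/2][j/2][k] : 0.0;
+}
+```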
+**#4 Sensitivity of the (convolutional layer's) input layer**
+The original model's backpropagation never actually uses this sensitivity: a layer's sensitivity is only needed to update the weights and biases upstream of it, and here the convolution input is the image itself. Still, it is worth deriving as preparation for changing the network structure later. The convolutional layer is the key difficulty in CNN backpropagation; see the post [卷积神经网络(CNN)反向传播算法](https://www.cnblogs.com/pinard/p/6494810.html).
+$$\begin{aligned}
+\delta &= \frac{\partial C}{\partial a^{in}}=(\frac{\partial z^{out}}{\partial a^{in}})^T\frac{\partial C}{\partial z^{out}} \\
+&=(\frac{\partial z^{out}}{\partial a^{in}})^T\frac{\partial C}{\partial ReLU(z^{out})}ReLU'(z^{out})\\
+&=(\frac{\partial z^{out}}{\partial a^{in}})^T\delta'ReLU'(z^{out})
+\end{aligned}$$
+The formula alone is not very illuminating, so consider a small example. Suppose the convolution input $a^{in}$ is a 3x3 matrix, the kernel $W$ is a 2x2 matrix, and the stride is 1, so the output $z^{out}$ is a 2x2 matrix. For simplicity assume all biases $b$ are 0. Then:
+$$\left(\begin{array}{ccc}
+  a_{11} & a_{12} & a_{13}\\
+  a_{21} & a_{22} & a_{23}\\
+  a_{31} & a_{32} & a_{33}\\
+\end{array}\right)*
+\left(\begin{array}{cc}
+  w_{11} & w_{12}\\
+  w_{21} & w_{22}\\
+\end{array}\right)=
+\left(\begin{array}{cc}
+  z_{11} & z_{12}\\
+  z_{21} & z_{22}\\
+\end{array}\right)$$
+Written out term by term:
+$$
+z_{11}=a_{11}w_{11}+a_{12}w_{12}+a_{21}w_{21}+a_{22}w_{22} \\
+z_{12}=a_{12}w_{11}+a_{13}w_{12}+a_{22}w_{21}+a_{23}w_{22} \\
+z_{21}=a_{21}w_{11}+a_{22}w_{12}+a_{31}w_{21}+a_{32}w_{22} \\
+z_{22}=a_{22}w_{11}+a_{23}w_{12}+a_{32}w_{21}+a_{33}w_{22}
+$$
+Computing the partial derivative with respect to each input entry, writing $\nabla a_{ij}=\delta^{in}_{ij}$ and using $\delta_{ij}$ as shorthand for $\partial C/\partial z_{ij}$, gives:
+$$\begin{aligned}
+\nabla a_{11}&=\delta_{11}w_{11} \\
+\nabla a_{12}&=\delta_{11}w_{12}+\delta_{12}w_{11} \\
+\nabla a_{13}&=\delta_{12}w_{12} \\
+\nabla a_{21}&=\delta_{11}w_{21}+\delta_{21}w_{11} \\
+\nabla a_{22}&=\delta_{11}w_{22}+\delta_{12}w_{21}+\delta_{21}w_{12}+\delta_{22}w_{11} \\
+\nabla a_{23}&=\delta_{12}w_{22}+\delta_{22}w_{12} \\
+\nabla a_{31}&=\delta_{21}w_{21} \\
+\nabla a_{32}&=\delta_{21}w_{22}+\delta_{22}w_{21} \\
+\nabla a_{33}&=\delta_{22}w_{22}
+\end{aligned}$$
+These equations can be expressed as a single matrix convolution:
+$$\left(\begin{array}{cccc}
+  0 & 0 & 0 & 0 \\
+  0 & \delta_{11} & \delta_{12} & 0\\
+  0 & \delta_{21} & \delta_{22} & 0\\
+  0 & 0 & 0 & 0 \\
+\end{array}\right)*
+\left(\begin{array}{cc}
+  w_{22} & w_{21}\\
+  w_{12} & w_{11}\\
+\end{array}\right)=
+\left(\begin{array}{ccc}
+  \nabla a_{11} & \nabla a_{12} & \nabla a_{13}\\
+  \nabla a_{21} & \nabla a_{22} & \nabla a_{23}\\
+  \nabla a_{31} & \nabla a_{32} & \nabla a_{33}\\
+\end{array}\right)
+$$
+From this we can read off the general sensitivity formula for the convolution input layer (the original blog post's description of the ReLU derivative is wrong). Since $\delta^{out}$ here denotes the sensitivity of the post-ReLU convolution output, the ReLU mask is applied to it before zero-padding and convolving with the 180°-rotated kernel (the sizes of $\delta^{in}$ and $z^{out}$ differ, so the mask cannot come last):
+$$
+\delta^{in}= pad_1(\delta^{out}\odot ReLU'(z^{out}))*rot_{180}(W) \\
+ReLU'(z^{out})=\left\{
+\begin{aligned}
+1&,z^{out}>0\\
+0&, otherwise
+\end{aligned}
+\right.
+$$
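+The current one-convolution model never needs $\delta^{in}$, but as a sanity check against the worked example, here is a hedged sketch of how the padded, rotated-kernel formula could be coded for a single channel and a single kernel; the macro and function names are mine, and the sizes simply match the 3x3 / 2x2 example above rather than `src/main.c`:
+```c
+#define IL 3               /* input size              */
+#define KL 2               /* kernel size             */
+#define OL (IL - KL + 1)   /* convolution output size */
+
+/* Sensitivity of the convolution input: mask the output sensitivities with
+ * ReLU'(z_out), zero-pad them by KL-1 on each side, then convolve with the
+ * kernel rotated by 180 degrees.                                           */
+void conv_input_delta(const double w[KL][KL],     /* kernel                            */
+                      const double z_out[OL][OL], /* pre-activation conv output        */
+                      const double d_out[OL][OL], /* sensitivity of ReLU(z_out)        */
+                      double d_in[IL][IL])        /* sensitivity of the input (result) */
+{
+    /* zero-padded, ReLU-masked output sensitivities */
+    double pad[OL + 2*(KL-1)][OL + 2*(KL-1)] = {{0}};
+    for (int i = 0; i < OL; ++i)
+        for (int j = 0; j < OL; ++j)
+            pad[i + KL-1][j + KL-1] = (z_out[i][j] > 0) ? d_out[i][j] : 0.0;
+
+    /* full convolution with the 180-degree-rotated kernel */
+    for (int i = 0; i < IL; ++i)
+        for (int j = 0; j < IL; ++j) {
+            double s = 0.0;
+            for (int a = 0; a < KL; ++a)
+                for (int b = 0; b < KL; ++b)
+                    s += pad[i + a][j + b] * w[KL-1 - a][KL-1 - b];
+            d_in[i][j] = s;
+        }
+}
+```
+For the example above (all $z^{out}$ entries positive), `d_in[0][0]` works out to $\delta_{11}w_{11}$ and `d_in[1][1]` to $\delta_{11}w_{22}+\delta_{12}w_{21}+\delta_{21}w_{12}+\delta_{22}w_{11}$, matching $\nabla a_{11}$ and $\nabla a_{22}$.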
+##### Updating the Weights and Biases
+For its parameter updates the model uses plain gradient descent, where $\eta$ is the learning rate; it controls the step size of each update and also reflects how quickly the network converges:
+$$\left\{
+\begin{aligned}
+w&=w-\eta \frac{\partial C}{\partial w} \\
+b&=b-\eta \frac{\partial C}{\partial b} \\
+\end{aligned}
+\right.$$
+**#1 Updating the fully connected weights**
+Using $a_i$ for the activated value of the $i$-th fully connected input node (as in #2 above):
+$$\begin{aligned}
+&\because \frac{\partial C}{\partial w_{ji}}=\frac{\partial C}{\partial y_j}\frac{\partial y_j}{\partial w_{ji}}=\delta_ja_{i}\\
+&\therefore w_{ji}=w_{ji}-\eta \delta_ja_{i}
+\end{aligned}$$
+**#2 Updating the fully connected biases**
+$$\begin{aligned}
+&\because \frac{\partial C}{\partial b_j}=\frac{\partial C}{\partial y_j}\frac{\partial y_j}{\partial b_j}=\delta_j\\
+&\therefore b_j=b_j-\eta \delta_j
+\end{aligned}$$
+**#3 Updating the convolution kernel weights**
+Suppose the input $a$ is a 4x4 matrix, the kernel $W$ is a 3x3 matrix, and the output $z$ is a 2x2 matrix; then the back-propagated sensitivity $\delta$ of $z$ is also a 2x2 matrix. Computing the entries one by one gives, for example, the following four equations:
+$$
+\frac{\partial C}{\partial w_{11}}=a_{11}\delta_{11}+a_{12}\delta_{12}+a_{21}\delta_{21}+a_{22}\delta_{22}\\
+\frac{\partial C}{\partial w_{12}}=a_{12}\delta_{11}+a_{13}\delta_{12}+a_{22}\delta_{21}+a_{23}\delta_{22}\\
+\frac{\partial C}{\partial w_{13}}=a_{13}\delta_{11}+a_{14}\delta_{12}+a_{23}\delta_{21}+a_{24}\delta_{22}\\
+\frac{\partial C}{\partial w_{21}}=a_{21}\delta_{11}+a_{22}\delta_{12}+a_{31}\delta_{21}+a_{32}\delta_{22}\\
+$$
+The pattern generalizes (indexing from 0) to:
+$$
+\frac{\partial C}{\partial w_{pq}}=\sum^{out.L}_{i=0}\sum^{out.W}_{j=0}\delta^{out}_{ij}a^{in}_{i+p,j+q}
+$$
+This is itself a convolution, so it can be written in matrix-convolution form:
+$$
+\frac{\partial C}{\partial W}=a^{in}*\delta^{out}
+$$
+
+**#4 Updating the convolution kernel biases**
+Note that the convolution bias belongs to a whole kernel: as the animation below shows (convolution demo, from [this site](http://cs231n.github.io/assets/conv-demo/index.html)), there is exactly one bias term per kernel, so the convolution-layer bias is a one-dimensional vector whose length equals the number of kernels.
+![Convolution demo](https://img-blog.csdnimg.cn/20191227002511777.gif#pic_center =500x400)
+For the $k$-th kernel:
+$$
+\frac{\partial C}{\partial b_k}=\sum^{out.L}_{i=0}\sum^{out.W}_{j=0}\frac{\partial C}{\partial z_{ij}}\frac{\partial z_{ij}}{\partial b_k}=\sum^{out.L}_{i=0}\sum^{out.W}_{j=0}\delta^{out}_{ij}
+$$
\ No newline at end of file
diff --git a/t10k-images.idx3-ubyte b/dataset/t10k-images.idx3-ubyte
similarity index 100%
rename from t10k-images.idx3-ubyte
rename to dataset/t10k-images.idx3-ubyte
diff --git a/t10k-labels.idx1-ubyte b/dataset/t10k-labels.idx1-ubyte
similarity index 100%
rename from t10k-labels.idx1-ubyte
rename to dataset/t10k-labels.idx1-ubyte
diff --git a/train-images.idx3-ubyte b/dataset/train-images.idx3-ubyte
similarity index 100%
rename from train-images.idx3-ubyte
rename to dataset/train-images.idx3-ubyte
diff --git a/train-labels.idx1-ubyte b/dataset/train-labels.idx1-ubyte
similarity index 100%
rename from train-labels.idx1-ubyte
rename to dataset/train-labels.idx1-ubyte
diff --git a/include/dataio.h b/include/dataio.h
new file mode 100644
index 0000000..83bf37c
--- /dev/null
+++ b/include/dataio.h
@@ -0,0 +1,71 @@
+//
+// Created by MAC on 2019/12/27.
+// + +#ifndef CNN_DATAIO_H +#define CNN_DATAIO_H + +#include +#include +#include "vector.h" + +int ReverseInt(int i) { + unsigned char ch1 = i & 255; + unsigned char ch2 = i >> 8 & 255; + unsigned char ch3 = i >> 16 & 255; + unsigned char ch4 = i >> 24 & 255; + return((int)ch1 << 24) + ((int)ch2 << 16) + ((int)ch3 << 8) + ch4; +} +Vector1D read_Mnist_Label(const char * fileName) { + Vector1D labels = Vector1D_(); + FILE* file = fopen(fileName, "rb"); + assert(file != NULL); + int tmp; + if (file) { + int magic_number = 0, number_of_images = 0; + tmp = fread(&magic_number, sizeof(magic_number), 1, file); + tmp = fread(&number_of_images, sizeof(number_of_images), 1, file); + magic_number = ReverseInt(magic_number); + number_of_images = ReverseInt(number_of_images); + labels.new(&labels, number_of_images); + for (int i = 0; i < number_of_images; i++) { + unsigned char label = 0; + tmp = fread(&label, sizeof(label), 1, file); + labels.data[i] = (double)label; + } + } + return labels; +} +Vector2D read_Mnist_Images(const char * fileName) { + Vector2D images = Vector2D_(); + FILE* file = fopen(fileName, "rb"); + assert(file != NULL); + int tmp; + if (file) { + int magic_number = 0; + int number_of_images = 0; + int n_rows = 0; + int n_cols = 0; + tmp = fread(&magic_number, sizeof(magic_number), 1, file); + tmp = fread(&number_of_images, sizeof(number_of_images), 1, file); + tmp = fread(&n_rows, sizeof(n_rows), 1, file); + tmp = fread(&n_cols, sizeof(n_cols), 1, file); + magic_number = ReverseInt(magic_number); + number_of_images = ReverseInt(number_of_images); + n_rows = ReverseInt(n_rows); + n_cols = ReverseInt(n_cols); + images.new(&images, number_of_images, n_rows * n_cols); + for (int i = 0; i < number_of_images; i++) { + for (int r = 0; r < n_rows; r++) { + for (int c = 0; c < n_cols; c++) { + unsigned char image = 0; + tmp = fread(&image, sizeof(image), 1, file); + images.data[i][r * n_rows + c] = (double)image; + } + } + } + } + return images; +} + +#endif //CNN_DATAIO_H diff --git a/vector.h b/include/vector.h similarity index 100% rename from vector.h rename to include/vector.h diff --git a/main.c b/src/main.c similarity index 56% rename from main.c rename to src/main.c index 8676e2f..38d2d31 100644 --- a/main.c +++ b/src/main.c @@ -2,8 +2,8 @@ #include #include #include -#include -#include "vector.h" +#include "../include/vector.h" +#include "../include/dataio.h" #define _type double @@ -11,9 +11,9 @@ #define _min(a,b) (((a)>(b))?(b):(a)) const int train_number = 20000; // 训练样本数 -const int test_number = 5000; // 测试样本数 +const int test_number = 4000; // 测试样本数 const int out = 10; // 分类种类数 -const _type alpha = 0.01; // 学习率 +const int epoch = 200; int step; Vector1D labels_train; @@ -21,6 +21,27 @@ Vector2D images_train; Vector1D labels_test; Vector2D images_test; +const double PI = 3.141592654; +const double mean = 0; +const double sigma = 0.1; +double gaussrand() { + static double U, V; + static int phase = 0; + double Z; + if (phase == 0) { + U = rand() / (RAND_MAX + 1.0); + V = rand() / (RAND_MAX + 1.0); + Z = sqrt(-2.0 * log(U)) * sin(2.0 * PI * V); + } else { + Z = sqrt(-2.0 * log(U)) * cos(2.0 * PI * V); + } + phase = 1 - phase; + return mean + Z * sigma; +} +double alpha() { + return 0.1 * exp(-0.023 * step); +} + typedef struct { // 定义卷积网络中的层 // L, W, H 分别代表m三个维度的大小 // L x W x H的卷积核,H是通道数 @@ -49,7 +70,7 @@ Fcnn_layer * Fcnn_layer_() { Fcnn_layer * fcnn_layer = (Fcnn_layer *)malloc(sizeof(Fcnn_layer)); for (int i = 0; i < 20; ++i) for (int j = 0; j < 1000; ++j) - 
fcnn_layer->w[i][j] = 0.01 * (rand() % 100); + fcnn_layer->w[i][j] = gaussrand(); return fcnn_layer; } typedef struct { //定义CNN @@ -76,80 +97,14 @@ Network * Network_() { //权重初始化 Network * CNN; -Layer * conv(Layer * A, Layer * B[], int number, Layer * C); -Layer * CNN_Input(int num, Layer * A, int flag); -Fcnn_layer * Classify_input(Layer * A, Fcnn_layer * B);//将卷积提取特征输入到全连接神经网络 -Layer * pool_input(Layer * A, Fcnn_layer * B);//全连接层的误差项传递到CNN中 -Layer * pool_delta(Layer * A, Layer * B);//当前层为池化层的敏感项传递 -Fcnn_layer * softmax(Fcnn_layer * A);//softmax函数 -_type Relu(_type x);//Relu函数 -Layer * Update(Layer * A, Layer * B, Layer * C);//filter更新 -Layer * maxpooling(Layer * conv_layer, Layer * A);//池化前向输出 -Fcnn_layer * fcnn_Mul(Fcnn_layer * A, Fcnn_layer * B, Fcnn_layer * C);//全连接层前向输出 -_type sum(Layer * A);//矩阵求和,此处用于敏感项求和 -_type sum1(Layer * A, Layer * B, int x, int y); -_type sigmod(_type x); -void test(); -int test_out(int t); - -/**************************此段为读取MNIST数据集模块**************/ -int ReverseInt(int i) { - unsigned char ch1 = i & 255; - unsigned char ch2 = i >> 8 & 255; - unsigned char ch3 = i >> 16 & 255; - unsigned char ch4 = i >> 24 & 255; - return((int)ch1 << 24) + ((int)ch2 << 16) + ((int)ch3 << 8) + ch4; -} -Vector1D read_Mnist_Label(const char * fileName) { - Vector1D labels = Vector1D_(); - FILE* file = fopen(fileName, "rb"); - assert(file != NULL); - if (file) { - int magic_number = 0, number_of_images = 0; - fread(&magic_number, sizeof(magic_number), 1, file); - fread(&number_of_images, sizeof(number_of_images), 1, file); - magic_number = ReverseInt(magic_number); - number_of_images = ReverseInt(number_of_images); - labels.new(&labels, number_of_images); - for (int i = 0; i < number_of_images; i++) { - unsigned char label = 0; - fread(&label, sizeof(label), 1, file); - labels.data[i] = (_type)label; - } - } - return labels; +// ReLU函数 +_type ReLU(_type x) { + return _max(0.0, x); } -Vector2D read_Mnist_Images(const char * fileName) { - Vector2D images = Vector2D_(); - FILE* file = fopen(fileName, "rb"); - assert(file != NULL); - if (file) { - int magic_number = 0; - int number_of_images = 0; - int n_rows = 0; - int n_cols = 0; - fread(&magic_number, sizeof(magic_number), 1, file); - fread(&number_of_images, sizeof(number_of_images), 1, file); - fread(&n_rows, sizeof(n_rows), 1, file); - fread(&n_cols, sizeof(n_cols), 1, file); - magic_number = ReverseInt(magic_number); - number_of_images = ReverseInt(number_of_images); - n_rows = ReverseInt(n_rows); - n_cols = ReverseInt(n_cols); - images.new(&images, number_of_images, n_rows * n_cols); - for (int i = 0; i < number_of_images; i++) { - for (int r = 0; r < n_rows; r++) { - for (int c = 0; c < n_cols; c++) { - unsigned char image = 0; - fread(&image, sizeof(image), 1, file); - images.data[i][r * n_rows + c] = (_type)image; - } - } - } - } - return images; + +_type sigmoid(_type x) { + return 1.0 / (1.0 + exp(-x)); } -/**************************************************************/ /** * 卷积函数,表示卷积层A与number个filterB相卷积 @@ -160,7 +115,7 @@ Vector2D read_Mnist_Images(const char * fileName) { * @return */ // CNN->conv_layer1 = conv(CNN->Input_layer, CNN->filter1, 5, CNN->conv_layer1); -Layer * conv(Layer * A, Layer * B[], int number, Layer * C) { +void conv(Layer * A, Layer * B[], int number, Layer * C) { memset(C->m, 0, sizeof(C->m)); // 5个5x5x1的卷积核 for (int i = 0; i < number; ++ i) { @@ -177,11 +132,10 @@ Layer * conv(Layer * A, Layer * B[], int number, Layer * C) { for (int b = 0; b < B[0]->W; ++ b) for (int k = 0; k < A->H; ++ k) 
C->m[i][j][num] += A->m[i + a][j + b][k] * B[num]->m[a][b][k]; - C->m[i][j][num] = Relu(C->m[i][j][num] + C->b[i][j][num]); + C->m[i][j][num] = ReLU(C->m[i][j][num] + C->b[i][j][num]); } } } - return C; } /** @@ -191,7 +145,7 @@ Layer * conv(Layer * A, Layer * B[], int number, Layer * C) { * @param flag 0代表训练,1代表测试 * @return */ -Layer * CNN_Input(int num, Layer * A, int flag) { +void CNN_Input(int num, Layer * A, int flag) { A->L = A->W = 28; A->H = 1; int x = 0; // 数据集中的data是一维存储的,需要变回二维(实际是三维,通道那一维的长度为1) @@ -205,7 +159,6 @@ Layer * CNN_Input(int num, Layer * A, int flag) { } } } - return A; } /** @@ -214,7 +167,7 @@ Layer * CNN_Input(int num, Layer * A, int flag) { * @param B 全连接层的输入 * @return */ -Fcnn_layer * Classify_input(Layer * A, Fcnn_layer * B) { +void Classify_input(Layer * A, Fcnn_layer * B) { int x = 0; // 这里又是一个reshape操作,把三维参数展成一维的 // TODO: B->m[0] = 1.0是什么意思 @@ -223,34 +176,31 @@ Fcnn_layer * Classify_input(Layer * A, Fcnn_layer * B) { for (int j = 0; j < A->W; ++ j) for (int k = 0; k < A->H; ++ k) // TODO: 这里用sigmoid激活会不会不太好 - B->m[x ++] = sigmod(A->m[i][j][k]); + B->m[x ++] = sigmoid(A->m[i][j][k]); B->length = x; - return B; } // 全连接层的误差项传递到CNN中 -Layer * pool_input(Layer * A, Fcnn_layer * B) { +void pool_input(Layer * A, Fcnn_layer * B) { // 这里又是一个reshape操作,把fcnn_input层的参数reshape回pool_layer1的维度 int x = 1; for (int i = 0; i < A->L; ++ i) for (int j = 0; j < A->W; ++ j) for (int k = 0; k < A->H; ++ k) A->delta[i][j][k] = B->delta[x ++]; - return A; } // 当前层为池化层的敏感项传递 -Layer * pool_delta(Layer * A, Layer * B) { +void pool_delta(Layer * A, Layer * B) { for (int k = 0; k < A->H; ++ k) { for (int i = 0; i < A->L; i += 2) { for (int j = 0; j < A->W; j += 2) { // 如果输入输出之差的绝对值小于0.01,认为该位置时max,传递delta;否则delta为0,不更新参数 - if (fabs(A->m[i][j][k] - B->m[i / 2][j / 2][k]) < 0.01) + if (fabs(A->m[i][j][k] - B->m[i / 2][j / 2][k]) < 0.001) A->delta[i][j][k] = B->delta[i / 2][j / 2][k]; else A->delta[i][j][k] = 0; } } } - return A; } /** * 2x2的滤波器最大池化 @@ -259,7 +209,7 @@ Layer * pool_delta(Layer * A, Layer * B) { * @return */ // TODO: 池化窗口大小不通用 -Layer * maxpooling(Layer * conv_layer, Layer * A) { +void maxpooling(Layer * conv_layer, Layer * A) { A->L = conv_layer->L / 2; A->W = conv_layer->W / 2; A->H = conv_layer->H; @@ -267,8 +217,7 @@ Layer * maxpooling(Layer * conv_layer, Layer * A) { for (int i = 0; i < conv_layer->L; i += 2) for (int j = 0; j < conv_layer->W; j += 2) A->m[i / 2][j / 2][k] = _max(_max(conv_layer->m[i][j][k], conv_layer->m[i + 1][j][k]), - _max(conv_layer->m[i][j + 1][k],conv_layer->m[i + 1][j + 1][k])); - return A; + _max(conv_layer->m[i][j + 1][k],conv_layer->m[i + 1][j + 1][k])); } /** * 全连接层参数乘法 @@ -277,7 +226,7 @@ Layer * maxpooling(Layer * conv_layer, Layer * A) { * @param C 输出 * @return */ -Fcnn_layer * fcnn_Mul(Fcnn_layer * A, Fcnn_layer * B, Fcnn_layer * C) { +void fcnn_Mul(Fcnn_layer * A, Fcnn_layer * B, Fcnn_layer * C) { memset(C->m, 0, sizeof(C->m)); C->length = out; for (int i = 0; i < C->length; ++ i) { @@ -286,52 +235,40 @@ Fcnn_layer * fcnn_Mul(Fcnn_layer * A, Fcnn_layer * B, Fcnn_layer * C) { } C->m[i] += B->b[i]; } - return C; -} -_type sigmod(_type x) { - return 1.0 / (1.0 + exp(-x)); } // 矩阵求和,此处用于敏感项求和 // 注意卷积的偏置是对于每个卷积核而言的 // 5个卷积核,那么偏置就是长度为5的向量 -_type sum(Layer * A) { +_type sum(Layer * A, int z) { _type a = 0; for (int i = 0; i < A->L; ++ i) for (int j = 0; j < A->W; ++ j) - for (int k = 0; k < A->H; ++ k) - a += A->delta[i][j][k]; + a += A->delta[i][j][z]; return a; } // 其实这里就是B->m * A->delta卷积的结果 -_type sum1(Layer * A, Layer * B, int x, int y) { 
+_type sum1(Layer * A, Layer * B, int x, int y, int z) { _type a = 0; for (int i = 0; i < A->L; ++ i) for (int j = 0; j < A->W; ++ j) - for (int k = 0; k < A->H; ++ k) - a += A->delta[i][j][k] * B->m[i + x][j + y][k]; + a += A->delta[i][j][z] * B->m[i + x][j + y][z]; return a; } // filter更新 // TODO: 这里的sum(C)函数明显欠优化 -Layer * Update(Layer * A, Layer * B, Layer * C) { +void Update(Layer * A, Layer * B, Layer * C, int z) { + int sum_C = sum(C, z); for (int i = 0; i < A->L; ++ i) { for (int j = 0; j < A->W; ++ j) { for (int k = 0; k < A->H; ++ k) { -// A->m[i][j][k] -= alpha * sum1(A, B, i, j); - A->m[i][j][k] -= alpha * sum1(C, B, i, j); -// C->b[i][j][k] -= alpha * sum(A); - C->b[i][j][k] -= alpha * sum(C); + A->m[i][j][k] -= alpha() * sum1(C, B, i, j, z); + C->b[i][j][k] -= alpha() * sum_C; } } } - return A; -} -// ReLU函数 -_type Relu(_type x) { - return _max(0.0, x); } /** @@ -340,12 +277,11 @@ _type Relu(_type x) { * @return */ // TODO: 这里的softmax公式减了一个最大值,不知道为啥这么做 -Fcnn_layer * softmax(Fcnn_layer * A) { +void softmax(Fcnn_layer * A) { _type sum = 0.0; _type maxi = -100000000; for (int i = 0; i < out; ++ i) maxi = _max(maxi,A->m[i]); for (int i = 0; i < out; ++ i) sum += exp(A->m[i] - maxi); for (int i = 0; i < out; ++ i) A->m[i] = exp(A->m[i] - maxi) / sum; - return A; } /** * 做一次前向输出 @@ -353,22 +289,17 @@ Fcnn_layer * softmax(Fcnn_layer * A) { * @param flag 0代表训练,1代表测试 */ void forward_propagation(int num, int flag) { - CNN->Input_layer = CNN_Input(num, CNN->Input_layer, flag); - CNN->conv_layer1 = conv(CNN->Input_layer, CNN->filter1, 5, CNN->conv_layer1); - CNN->pool_layer1 = maxpooling(CNN->conv_layer1, CNN->pool_layer1); - //CNN->conv_layer2=conv(CNN->pool_layer1,CNN->filter2,3,CNN->conv_layer2,0); - //CNN->pool_layer2=maxpooling(CNN->conv_layer2,CNN->pool_layer2); - CNN->fcnn_input = Classify_input(CNN->pool_layer1, CNN->fcnn_input); - //for(int i=0;ifcnn_input->length;i++) printf("%.5f ",CNN->fcnn_input->m[i]); - CNN->fcnn_output = fcnn_Mul(CNN->fcnn_input, CNN->fcnn_w, CNN->fcnn_output); - CNN->fcnn_output = softmax(CNN->fcnn_output); + CNN_Input(num, CNN->Input_layer, flag); + conv(CNN->Input_layer, CNN->filter1, 5, CNN->conv_layer1); + maxpooling(CNN->conv_layer1, CNN->pool_layer1); + Classify_input(CNN->pool_layer1, CNN->fcnn_input); + fcnn_Mul(CNN->fcnn_input, CNN->fcnn_w, CNN->fcnn_output); + softmax(CNN->fcnn_output); // 这个delta原来算的是预测分布与真实分布之差 for (int i = 0; i < out; ++ i) { if (i == (int)labels_train.data[num]) CNN->fcnn_output->delta[i] = CNN->fcnn_output->m[i] - 1.0; else CNN->fcnn_output->delta[i] = CNN->fcnn_output->m[i]; - //printf("%.5f ",CNN->fcnn_output->m[i]); } - //printf(" %.0f\n",labels[num]); } /** @@ -379,72 +310,68 @@ void back_propagation() { for (int i = 0; i < CNN->fcnn_input->length; ++ i) { for (int j = 0; j < out; ++ j) { CNN->fcnn_input->delta[i] += CNN->fcnn_input->m[i] * (1.0 - CNN->fcnn_input->m[i]) - * CNN->fcnn_w->w[j][i] * CNN->fcnn_output->delta[j]; + * CNN->fcnn_w->w[j][i] * CNN->fcnn_output->delta[j]; } } for (int i = 0; i < CNN->fcnn_input->length; ++ i) { for (int j = 0; j < out; ++ j) { - CNN->fcnn_w->w[j][i] -= alpha * CNN->fcnn_output->delta[j] * CNN->fcnn_input->m[i]; - CNN->fcnn_w->b[j] -= alpha * CNN->fcnn_output->delta[j]; + CNN->fcnn_w->w[j][i] -= alpha() * CNN->fcnn_output->delta[j] * CNN->fcnn_input->m[i]; + CNN->fcnn_w->b[j] -= alpha() * CNN->fcnn_output->delta[j]; } } - CNN->pool_layer1 = pool_input(CNN->pool_layer1, CNN->fcnn_input); - CNN->conv_layer1 = pool_delta(CNN->conv_layer1, CNN->pool_layer1); // pooling误差传递 - 
//CNN->pool_layer1=conv(CNN->conv_layer1,CNN->filter2,3,CNN->pool_layer1,1);//conv误差传递 + pool_input(CNN->pool_layer1, CNN->fcnn_input); + pool_delta(CNN->conv_layer1, CNN->pool_layer1); // pooling误差传递 for (int i = 0; i < 5; ++ i) - CNN->filter1[i] = Update(CNN->filter1[i], CNN->Input_layer, CNN->conv_layer1); + Update(CNN->filter1[i], CNN->Input_layer, CNN->conv_layer1, i); } + +int test_out(int t) { + forward_propagation(t,1); + // argmax() + int ans = -1; + _type sign = -1; + for (int i = 0; i < out; ++ i) { + if (CNN->fcnn_output->m[i] > sign) { + sign = CNN->fcnn_output->m[i]; + ans = i; + } + } + return ans; +} + +void test() { + int sum = 0; + for (int i = 0; i < test_number; ++ i) { + if (test_out(i) == (int)labels_test.data[i]) sum ++; + } + printf("prec: %.5f\n", 1.0 * sum / test_number); + ++ step; +} + void train() { step = 0; - for(int time = 0; time < 100; ++ time) { + for(int time = 0; time < epoch; ++ time) { _type err = 0; for(int i = 0; i < train_number; i ++) { // 遍历每一个训练样本 forward_propagation(i, 0); - // 交叉熵损失函数,P((int)labels_train.data[i]) = 1, P(其它类别) = 0, 所以只剩下Q(x) + // 交叉熵损失函数,P((int)labels_train.dataset[i]) = 1, P(其它类别) = 0, 所以只剩下Q(x) err -= log(CNN->fcnn_output->m[(int)labels_train.data[i]]); back_propagation(); } - printf("step: %d loss:%.5f\n", time, 1.0 * err / train_number);//每次记录一遍数据集的平均误差 + printf("step: %3d loss: %.5f ", time, 1.0 * err / train_number);//每次记录一遍数据集的平均误差 test(); } } -void test() { - int sum=0; - for(int i=0;ifcnn_output->m[i]>sign) - { - sign=CNN->fcnn_output->m[i]; - ans=i; - } - } - return ans; -} int main() { // 初始化CNN存储结构 CNN = Network_(); // 读取数据集 - labels_train = read_Mnist_Label("..\\train-labels.idx1-ubyte"); - images_train = read_Mnist_Images("..\\train-images.idx3-ubyte"); - labels_test = read_Mnist_Label("..\\t10k-labels.idx1-ubyte"); - images_test = read_Mnist_Images("..\\t10k-images.idx3-ubyte"); + labels_train = read_Mnist_Label("../dataset/train-labels.idx1-ubyte"); + images_train = read_Mnist_Images("../dataset/train-images.idx3-ubyte"); + labels_test = read_Mnist_Label("../dataset/t10k-labels.idx1-ubyte"); + images_test = read_Mnist_Images("../dataset/t10k-images.idx3-ubyte"); // 对图片像素进行归一化 for (int i = 0; i < images_train.rows; ++ i) @@ -464,4 +391,4 @@ int main() { labels_test.destroy(&labels_test); images_test.destroy(&images_test); return 0; -} \ No newline at end of file +} diff --git a/test.c b/test.c deleted file mode 100644 index 62e1694..0000000 --- a/test.c +++ /dev/null @@ -1,10 +0,0 @@ -// -// Created by MAC on 2019/12/1. -// - -#include -#include - -int main(void) { - return 0; -} \ No newline at end of file