[Machine Learning] A BP Neural Network Hand-Written in C++ (with minibatch and Adam optimization)


The meaning of every function and variable is documented in the comments.
Dataset: click to download
Test results (first column: training-set accuracy; second column: test-set accuracy; the row number is the number of network layers, with 20 neurons per layer):
(results screenshot)

Don't make the network too deep when testing; very deep configurations are prone to exploding gradients.
Required headers:

#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>
#include <fstream>
#include <sstream>
#include <string>   // std::string is used in the BP constructor
#include <cstdlib>  // rand, srand, atof
using namespace std; // the code below uses unqualified vector, string, cout, ...

BP class definition:

class BP
{
private:
    vector<vector<vector<double>>> w;//weights to be learned
    vector<vector<double>>b;//bias of each layer
    int layers;//number of layers
    vector<int>nl;//number of neurons in each layer
    int m;//total number of samples
    int act;//which activation function to use
    double c;//regularization coefficient
    double a;//learning rate
    int seed;//random seed
    bool adam;//whether to use Adam optimization
    double testp;//fraction of samples used as the training set
    vector<vector<double>>za;//activation value of every neuron
    vector<vector<double>>zz;//pre-activation (input) value of every neuron
    vector<vector<double>>dz;//partial derivative w.r.t. every neuron's pre-activation

    double forward(double **x, int *y, int q); //forward pass for the q-th sample
    void back(double **x, int *y, int q); //backward pass for the q-th sample
    double relu(double x) { return max(0.0, x); } //the three activation functions and their derivatives
    double drelu(double x) { return x >= 0 ? 1 : 0; }
    double sigmod(double x) { return 1 / (1 + exp(-x)); }
    double dsigmod(double x) { return sigmod(x)*(1 - sigmod(x)); }
    double tanh(double x) { return std::tanh(x); }
    double dtanh(double x) { return 1 - std::tanh(x)*std::tanh(x); }
public:
    BP(int layers, int m, int *nl, double a, double c, string act = "relu", double testp = 0.7, bool adam = false, int seed = 0);
    void init();
    void train(double **x, int *y, int minibatch); //run training
    double testacc(double**x, int *y); //accuracy on the test set
    double trainacc(double**x, int *y);//accuracy on the training set
    void norminput(double**x); //normalize the inputs
};

The definition of each function follows.
Constructor. The layer count passed in does not include the final output layer, so it is added inside the constructor; binary classification is assumed, so the last layer needs only a single neuron (a small usage sketch follows the definition below).

BP::BP(int layers, int m, int *nl, double a, double c, string act, double testp, bool adam, int seed)
{
    this->layers = layers + 1;
    this->m = m;
    for (int i = 0; i < layers; i++)
        this->nl.emplace_back(nl[i]);
    this->nl.emplace_back(1);
    if (act == "relu")
        this->act = 1;
    else if (act == "tanh")
        this->act = 2;
    else if (act == "sigmod")
        this->act = 3;
    this->seed = seed;
    this->a = a;
    this->c = c;
    this->adam = adam;
    this->testp = testp;
}
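As a hypothetical example (not from the original post), a network with 57 input features and two hidden layers of 20 neurons would be constructed like this; the constructor appends the output neuron itself, so the stored topology becomes 57-20-20-1:

int topo[3] = { 57, 20, 20 };                            // topo[0] is the input width, the rest are hidden widths
BP net(3, m, topo, 0.01, 0.0001, "tanh", 0.7, true, 0);  // 3 = input layer + 2 hidden layers; output layer added internally
net.init();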

Initialization: allocate all the buffers and randomly initialize the weights and biases to small values (rand() % 1000 / 100000.0 - 0.005 lands roughly in [-0.005, 0.005)).

void BP::init()
{
    srand(seed);
    w = vector< vector<vector<double>>>(layers - 1);
    for (int i = 0; i < layers - 1; i++)
    {
        w[i] = vector<vector<double>>(nl[i + 1]);
        for (int j = 0; j < nl[i + 1]; j++)
        {
            w[i][j] = vector<double>(nl[i]);
            for (int k = 0; k < nl[i]; k++)
                w[i][j][k] = rand() % 1000 / (double)100000 - 0.005;
        }
    }

    b = vector<vector<double>>(layers - 1);
    for (int i = 0; i < layers - 1; i++)
    {
        b[i] = vector<double>(nl[i + 1]);
        for (int j = 0; j < nl[i + 1]; j++)
            b[i][j] = rand() % 1000 / (double)100000 - 0.005;
    }

    za = vector<vector<double>>(layers);
    for (int i = 0; i < layers; i++)
        za[i] = vector<double>(nl[i]);

    zz = vector<vector<double>>(layers);
    for (int i = 0; i < layers; i++)
        zz[i] = vector<double>(nl[i]);

    dz = vector<vector<double>>(layers);
    for (int i = 0; i < layers; i++)
        dz[i] = vector<double>(nl[i]);
}

Forward pass for a single sample: the first layer is just the input x, then every neuron of every layer computes its pre-activation and activation in turn, and the function returns the sample's log-loss term (the sign is flipped later in train()). The last layer always uses the sigmoid activation.
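In standard notation (a reference sketch, not part of the original post), layer l of the loop below computes

$$z^{(l)}_j = \sum_k w^{(l)}_{jk}\, a^{(l-1)}_k + b^{(l)}_j, \qquad a^{(l)}_j = g\big(z^{(l)}_j\big),$$

and the returned value is the log-likelihood of the sample,

$$J_q = y_q \log a^{(L)} + (1 - y_q)\log\big(1 - a^{(L)}\big),$$

which train() later averages over the minibatch and negates to obtain the loss.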

double BP::forward(double **x, int *y, int q)
{
    for (int i = 0; i < nl[0]; i++)
        za[0][i] = x[q][i];
    for (int i = 0; i < layers - 1; i++)
    {
        for (int j = 0; j < nl[i + 1]; j++)
        {
            double z = 0;
            for (int k = 0; k < nl[i]; k++)
                z += w[i][j][k] * za[i][k];
            z += b[i][j];
            zz[i + 1][j] = z;
            if (i == layers - 2)
                z = sigmod(z);
            else
            {
                switch (act)
                {
                case 1:z = relu(z); break;
                case 2:z = tanh(z); break;
                case 3:z = sigmod(z); break;
                }
            }
            za[i + 1][j] = z;
        }
    }
    double J = y[q] * log(za[layers - 1][0] + 1e-8) + (1 - y[q])*log(1 - za[layers - 1][0] + 1e-8);
    return J;
}

Backward pass for a single sample. The derivative at the last layer is not simply defined to be a - y; it comes from multiplying the derivative of the loss with respect to a by the derivative of the sigmoid with respect to z, which simplifies to a - y.
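To check that simplification (a standard derivation, written out here for reference), let a = sigmod(z) at the output and L = -[y log a + (1 - y) log(1 - a)]. Then

$$\frac{\partial L}{\partial a} = -\left(\frac{y}{a} - \frac{1-y}{1-a}\right) = \frac{a-y}{a(1-a)}, \qquad \frac{\partial a}{\partial z} = a(1-a),$$

$$\frac{\partial L}{\partial z} = \frac{\partial L}{\partial a}\cdot\frac{\partial a}{\partial z} = a - y,$$

which is exactly what dz[layers - 1][0] is set to below.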

void BP::back(double**x, int *y, int q)
{
    dz[layers - 1][0] = za[layers - 1][0] - y[q];

    for (int i = layers - 2; i > 0; i--)
    {
        for (int j = 0; j < nl[i]; j++)
        {
            double dd = 0;
            for (int k = 0; k < nl[i + 1]; k++)
                dd += w[i][k][j] * dz[i + 1][k];
            switch (act)
            {
            case 1:dd *= drelu(zz[i][j]); break;
            case 2:dd *= dtanh(zz[i][j]); break;
            case 3:dd *= dsigmod(zz[i][j]); break;
            }
            dz[i][j] = dd;
        }
    }
}
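For the hidden layers, the loop above implements the usual backpropagation recursion (same notation as before, added for reference):

$$\delta^{(i)}_j = \Big(\sum_k w^{(i)}_{kj}\, \delta^{(i+1)}_k\Big)\, g'\big(z^{(i)}_j\big),$$

where δ corresponds to dz in the code and w[i][k][j] is the weight from neuron j of layer i to neuron k of layer i+1. train() then accumulates the gradients as dw[i][j][k] += dz[i+1][j] * za[i][k] and db[i-1][j] += dz[i][j].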

Training function. If minibatch is larger than the training set, each pass is effectively one full batch; dnum is the number of minibatches per epoch. After every epoch the loss is computed and compared with the previous epoch's: if the change is no more than 1e-5 training stops, and if it still has not converged after 200 epochs training stops as well.
Here dw holds the partial derivatives of the loss with respect to each layer's w, and db likewise for the biases; vdw and vdb are exponentially weighted moving averages of dw and db, while sdw and sdb are exponentially weighted moving averages of (dw)² and (db)². These four quantities implement the Adam update.
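Distilled to a single scalar weight, the Adam step applied inside the loop below looks like this (a minimal sketch with the same constants as the code; adam_step is a hypothetical helper, not part of the BP class):

// One Adam step for a scalar parameter w with gradient grad.
// v and s are the running first and second moments, t is the global step count.
void adam_step(double &w, double grad, double &v, double &s, int t, double a)
{
    v = 0.9 * v + 0.1 * grad;                // EWMA of the gradient (beta1 = 0.9)
    s = 0.999 * s + 0.001 * grad * grad;     // EWMA of the squared gradient (beta2 = 0.999)
    double vhat = v / (1 - pow(0.9, t));     // bias correction
    double shat = s / (1 - pow(0.999, t));
    w -= a * vhat / sqrt(shat + 1e-8);       // epsilon kept inside the sqrt, matching the code below
}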

void BP::train(double **x, int *y, int minibatch)
{
    vector<vector<vector<double>>> dw(layers - 1);
    for (int i = 0; i < layers - 1; i++)
    {
        dw[i] = vector<vector<double>>(nl[i + 1]);
        for (int j = 0; j < nl[i + 1]; j++)
            dw[i][j] = vector<double>(nl[i]);
    }
    vector<vector<double>>db(layers - 1);
    for (int i = 0; i < layers - 1; i++)
        db[i] = vector<double>(nl[i + 1]);
    vector <vector<vector<double>>>vdw(layers - 1);
    for (int i = 0; i < layers - 1; i++)
    {
        vdw[i] = vector<vector<double>>(nl[i + 1]);
        for (int j = 0; j < nl[i + 1]; j++)
            vdw[i][j] = vector<double>(nl[i], 0);
    }
    vector<vector<double>>vdb(layers - 1);
    for (int i = 0; i < layers - 1; i++)
        vdb[i] = vector<double>(nl[i + 1], 0);
    vector <vector<vector<double>>>sdw(layers - 1);
    for (int i = 0; i < layers - 1; i++)
    {
        sdw[i] = vector<vector<double>>(nl[i + 1]);
        for (int j = 0; j < nl[i + 1]; j++)
            sdw[i][j] = vector<double>(nl[i], 0);
    }
    vector<vector<double>>sdb(layers - 1);
    for (int i = 0; i < layers - 1; i++)
        sdb[i] = vector<double>(nl[i + 1], 0);


    double Jlast = 0;
    double Jnow = 100000;
    int t = 1;
    int dnum = (int)(m*testp) % minibatch == 0 ? (m*testp) / minibatch : (m*testp) / minibatch + 1;
    int anum = 0;
    while (abs(Jlast - Jnow) > 1e-5)
    {

        Jlast = Jnow;
        Jnow = 0;
        int m1 = 0, m2 = min((int)(m*testp), m1 + minibatch);

        for (int ci = 0; ci < dnum; ci++)
        {
            for (int i = 0; i < layers - 1; i++)
                for (int j = 0; j < nl[i + 1]; j++)
                    for (int k = 0; k < nl[i]; k++)
                        dw[i][j][k] = 0;
            for (int i = 1; i < layers; i++)
                for (int j = 0; j < nl[i]; j++)
                    db[i - 1][j] = 0;
            //cout << "epoch " << t/dnum +1 << ": " << ci + 1 << "/" << dnum << endl;
            if (ci != 0)
                m1 += minibatch, m2 = min((int)(m*testp), m2 + minibatch);
            double J = 0;

            for (int i = m1; i < m2; i++)
            {

                double j = forward(x, y, i);
                //cout << j << endl;
                J += j;
                back(x, y, i);
                for (int i = 0; i < layers - 1; i++)
                    for (int j = 0; j < nl[i + 1]; j++)
                        for (int k = 0; k < nl[i]; k++)
                            dw[i][j][k] += dz[i + 1][j] * za[i][k];
                for (int i = 1; i < layers; i++)
                    for (int j = 0; j < nl[i]; j++)
                        db[i - 1][j] += dz[i][j];
            }

            J /= -(double)(m2 - m1);

            Jnow += J;
            for (int i = 0; i < layers - 1; i++)
                for (int j = 0; j < nl[i + 1]; j++)
                    for (int k = 0; k < nl[i]; k++)
                    {
                        dw[i][j][k] = dw[i][j][k] / (double)(m2 - m1) + w[i][j][k] * c / (m2 - m1);
                        if (!adam)
                            w[i][j][k] -= a * dw[i][j][k];
                        else
                        {
                            vdw[i][j][k] = 0.9*vdw[i][j][k] + 0.1*dw[i][j][k];
                            sdw[i][j][k] = 0.999*sdw[i][j][k] + 0.001*dw[i][j][k] * dw[i][j][k];
                            double vdwt = vdw[i][j][k] / (1 - pow(0.9, t));
                            double sdwt = sdw[i][j][k] / (1 - pow(0.999, t));
                            w[i][j][k] -= a * vdwt / sqrt(sdwt + 1e-8);
                        }
                    }
            for (int i = 1; i < layers; i++)
                for (int j = 0; j < nl[i]; j++)
                {
                    db[i - 1][j] = db[i - 1][j] / (double)(m2 - m1);
                    if (!adam)
                        b[i - 1][j] -= a * db[i - 1][j];
                    else
                    {
                        vdb[i - 1][j] = 0.9*vdb[i - 1][j] + 0.1*db[i - 1][j];
                        sdb[i - 1][j] = 0.999*sdb[i - 1][j] + 0.001*db[i - 1][j] * db[i - 1][j];
                        double vdbt = vdb[i - 1][j] / (1 - pow(0.9, t));
                        double sdbt = sdb[i - 1][j] / (1 - pow(0.999, t));
                        b[i - 1][j] -= a * vdbt / (sqrt(sdbt + 1e-8));
                        //cout << db[i - 1][j] << " " << vdbt / (sqrt(sdbt + 1e-8)) << endl;
                    }
                }
            t++;
        }

        //cout << Jnow << endl;
        double JC = 0;
        for (int i = 0; i < layers - 1; i++)
            for (int j = 0; j < nl[i + 1]; j++)
                for (int k = 0; k < nl[i]; k++)
                    JC += w[i][j][k] * w[i][j][k];
        JC *= c;
        JC /= 2 * (m*testp);
        Jnow += JC;

        //printf("%lf %lf\n", Jnow, Jlast);
        if (t / dnum >= 200)
            break;
    }
}
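The JC term added at the end of every epoch is the L2 penalty, so the quantity Jnow watched by the stopping criterion is (a reference formula; note that each minibatch contributes its own average rather than a sample-weighted one):

$$J = \sum_{\text{batches } B}\Big(-\frac{1}{|B|}\sum_{q\in B}\big[y_q\log a_q + (1-y_q)\log(1-a_q)\big]\Big) + \frac{c}{2\,m_{\text{train}}}\sum_{i,j,k}\big(w^{(i)}_{jk}\big)^2,$$

with m_train = m * testp; the matching gradient term c·w/|B| is added to dw inside the minibatch loop.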

Two accuracy functions; nothing much to explain.

double BP::testacc(double**x, int *y)
{
    int m1 = (int)(m*testp);
    int tr = 0;
    for (int i = m1; i < m; i++)
    {
        forward(x, y, i);
        if (za[layers - 1][0] >= 0.5&&y[i] == 1)
            tr++;
        if (za[layers - 1][0] < 0.5&&y[i] == 0)
            tr++;
    }
    return tr / (double)(m - m1);
}
double BP::trainacc(double**x, int *y)
{
    int tr = 0;
    for (int i = 0; i < (int)(m*testp); i++)
    {
        forward(x, y, i);
        if (za[layers - 1][0] >= 0.5&&y[i] == 1)
            tr++;
        if (za[layers - 1][0] < 0.5&&y[i] == 0)
            tr++;
    }
    return tr / (double)(m*testp);
}

Normalize the inputs: for each feature, subtract its mean and divide by its range (dividing by the standard deviation would also work).
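In formula form (a reference sketch), every feature i of every sample j is rescaled as

$$x_{ji} \leftarrow \frac{x_{ji} - \mu_i}{\max_i - \min_i},$$

where μ_i, max_i and min_i are taken over all m samples.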

void BP::norminput(double**x)
{
    for (int i = 0; i < nl[0]; i++)
    {
        double avg = 0;
        double maxx = x[0][i], minn = x[0][i]; // initialize from the first sample so purely negative features are handled correctly
        for (int j = 0; j < m; j++)
        {
            avg += x[j][i];
            maxx = max(maxx, x[j][i]);
            minn = min(minn, x[j][i]);
        }
        avg /= m;
        for (int j = 0; j < m; j++)
            x[j][i] = (x[j][i] - avg) / (maxx - minn);
    }
}

Finally, the main function: read the data file and try networks with different numbers of layers.

int main() {
    ifstream f;
    f.open("spambase.csv");
    if (!f.is_open())
    {
        cout << "open error" << endl;
        return 0;
    }
    string line;
    int m = 0;
    int n = 0;
    double **x = new double*[10000];
    int *y = new int[10000];
    while (getline(f, line))
    {
        int n1 = 0;
        x[m] = new double[1000];
        istringstream sin(line);
        string temp;
        while (getline(sin, temp, ','))
        {
            x[m][n1] = atof(temp.c_str());
            //cout << x[m][n1] << endl;
            n1++;
        }
        y[m] = x[m][n1 - 1] == -1 ? 0 : 1;

        m++;
        n = n1 - 1;
    }
    f.close();
    int *nl = new int[11];
    nl[0] = n;
    nl[1] = 20;
    nl[2] = 20;
    nl[3] = 20;
    nl[4] = 20;
    nl[5] = 20;
    nl[6] = 20;
    for (int i = 1; i <= 6; i++)
    {
        BP B(i, m, nl, 0.01, 0.0001, "tanh", 0.7, true, 0);
        B.init();
        B.norminput(x);
        B.train(x, y, 64);
        double trainacc = B.trainacc(x, y);
        double acc = B.testacc(x, y);
        printf("%lf %lf\n", trainacc, acc);
        //system("pause");
    }
}