【机器学习】C++手撸一个逻辑回归

flag数组:flag[i]为false时表示第i个样本属于测试集,为true时表示属于训练集。

J函数:logistic回归的代价函数,为对数代价(含L2正则项)。

logistic函数:x特征,y标签,n特征数,m1样本起始点,m2样本结束点,a学习率,c正则化参数,返回长度为n+1的数组,即训练好的参数。

kfold函数:x特征,y标签,n特征数,m样本数,k折数,返回k折平均acc。

测试数据: 点击下载
数据说明:第10列为标签,-1与1,前九列为特征。

代码运行结果:(5折交叉验证)
结果
代码:

#include <iostream>
#include<vector>
#include<queue>
#include<algorithm>
#include<string>
#include<vector>
#include<stack>
#include<cmath>
#include<set>
#include<string.h>
#include<fstream>
#include<sstream>
#include<ctime>
using namespace std;
// Train/test mask shared by the functions in this file: flag[i] == true means
// sample i is used for training, flag[i] == false means it is held out.
bool flag[10000];

/**
 * Regularized logistic-regression cost over the training samples in [m1, m2).
 *
 * Only samples whose flag[i] is true contribute. The result is the mean log
 * loss of those samples plus the L2 penalty c * sum(s[j]^2, j = 1..n) / (2 * count);
 * the bias term s[0] is not penalized.
 *
 * @param s  n + 1 model parameters; s[0] is the bias, s[j] weights feature j - 1.
 * @param x  Sample rows; x[i][0 .. n-1] are the features of sample i.
 * @param y  Labels; 1 is the positive class, any other value is treated as negative.
 * @param m1 First sample index (inclusive).
 * @param m2 Last sample index (exclusive).
 * @param n  Number of features.
 * @param c  L2 regularization weight.
 * @return   The cost, or 0 when the range contains no training sample
 *           (instead of the NaN a 0/0 division would produce).
 */
double J(double *s, double **x, int *y,int m1, int m2, int n,double c)
{
    double lossSum = 0;
    int trainCount = 0;
    for (int i = m1; i < m2; i++)
    {
        if (!flag[i])
            continue;
        trainCount++;

        double score = s[0];
        for (int j = 1; j <= n; j++)
            score += s[j] * x[i][j - 1];

        // With h = sigmoid(score): -log(h) = softplus(-score) and
        // -log(1 - h) = softplus(score). Evaluating softplus(t) as
        // max(t, 0) + log(1 + exp(-|t|)) never overflows, so the loss stays
        // finite even when the sigmoid saturates to exactly 0 or 1.
        const double t = (y[i] == 1) ? -score : score;
        lossSum += (t > 0 ? t : 0) + std::log(1.0 + std::exp(-std::fabs(t)));
    }

    // No training sample in range: nothing to average over.
    if (trainCount == 0)
        return 0;

    double penalty = 0;
    for (int j = 1; j <= n; j++)
        penalty += s[j] * s[j];
    penalty *= c;
    penalty /= 2 * trainCount;

    return lossSum / trainCount + penalty;
}
double* logistic(double **x, int *y, int n,int m1, int m2,double a,double c)
{
    double *s = new double[n + 1];//θ
    srand(time(0));
    for (int i = 0; i < n + 1; i++)
        s[i] = rand()%100/(double)100;//随机初始化
    cout << s[0] << endl;
    double last = 0;
    double now = J(s, x, y, m1,m2, n, c);

    double *temp = new double[n + 1];
    while (abs(last - now) > 0.00001)
    {
        //cout <<  now << endl;
        for (int i = 0; i <= n; i++)
        {
            double mqiuhe = 0;
            int fa = 0;
            for (int k = m1; k < m2; k++)
            {
                if (!flag[k])
                {
                    fa++;
                    continue;
                }
                double z = -s[0];
                for (int j = 1; j <= n; j++)
                    z -= s[j] * x[k][j-1];
                double h = (double)1 / (1 + exp(z));
                h -= y[k];
                if(i!=0)
                    h *= x[k][i-1];
                mqiuhe += h;
            }
            mqiuhe /= m2-m1-fa;
            mqiuhe *= a;
            double C = 0;
            if(i!=0)
                C = c / (m2 - m1 - fa)*s[i],C *= a;

            temp[i] = s[i] - mqiuhe - C;
        }
        for (int i = 0; i <= n; i++)
            s[i] = temp[i];
        last = now;
        now = J(s, x, y, m1,m2, n, c);
    }
    return s;
}
/**
 * k-fold cross-validation of the logistic-regression model.
 *
 * The first k * (m / k) samples are split into k consecutive folds of m / k
 * samples each. For every fold the model is trained on all other samples
 * (learning rate 0.1, regularization weight 0.01) and evaluated on the fold;
 * the per-fold accuracy is printed. When m is not a multiple of k, the
 * trailing m % k samples are always training data and are never tested.
 *
 * @param x Sample rows; x[i][0 .. n-1] are the features of sample i.
 * @param y Labels (1 = positive class, 0 = negative class).
 * @param n Number of features.
 * @param m Number of samples.
 * @param k Number of folds.
 * @return  Mean accuracy over the k folds, or 0 when the arguments are
 *          unusable (k <= 0, m < k, or m larger than the flag[] mask).
 */
double kfold(double **x, int *y, int n, int m, int k)
{
    const int capacity = sizeof(flag) / sizeof(flag[0]);
    if (k <= 0 || m < k || m > capacity)
    {
        std::cerr << "kfold: invalid arguments" << std::endl;
        return 0;
    }

    const int foldSize = m / k;
    double total = 0;
    for (int fold = 0; fold < k; fold++)
    {
        const int testBegin = fold * foldSize;
        const int testEnd = testBegin + foldSize;

        // true = training sample, false = held-out test sample
        for (int i = 0; i < m; i++)
            flag[i] = (i < testBegin || i >= testEnd);

        double *s = logistic(x, y, n, 0, m, 0.1, 0.01);// learning rate 0.1, regularization weight 0.01

        int correct = 0;
        for (int i = testBegin; i < testEnd; i++)
        {
            double score = s[0];
            for (int f = 1; f <= n; f++)
                score += s[f] * x[i][f - 1];
            const double h = (double)1 / (1 + std::exp(-score));
            if ((h >= 0.5 && y[i] == 1) || (h < 0.5 && y[i] == 0))
                correct++;
        }
        delete[] s;// logistic() hands ownership of the parameters to the caller

        const double acc = correct / (double)foldSize;
        total += acc;
        std::cout << "第" << fold + 1 << "次acc值为:" << acc << std::endl;
    }

    return total / k;
}
/**
 * Loads "breast-w.csv" and prints the 5-fold cross-validation accuracy.
 *
 * Every non-empty line is split on ',' and parsed with atof. The last column
 * is the label (-1 maps to class 0, anything else to class 1); the preceding
 * columns are the features. The first accepted row fixes the column count;
 * rows with fewer than two columns or a different column count are skipped.
 *
 * @return 0 on success, 1 when the file cannot be opened or holds no usable row.
 */
int main() {
    // Must not exceed the size of the global flag[] train/test mask.
    const int kMaxSamples = 10000;

    std::ifstream f("breast-w.csv");
    if (!f.is_open())
    {
        std::cout << "open error" << std::endl;
        return 1;
    }

    std::vector<std::vector<double> > rows;// each row: features followed by the raw label
    std::vector<int> labels;
    std::vector<double>::size_type width = 0;
    std::string line;
    while (std::getline(f, line))
    {
        // Tolerate Windows line endings and blank lines.
        if (!line.empty() && line[line.size() - 1] == '\r')
            line.erase(line.size() - 1);
        if (line.empty())
            continue;

        if ((int)rows.size() >= kMaxSamples)
        {
            std::cerr << "too many samples, extra lines ignored" << std::endl;
            break;
        }

        std::vector<double> row;
        std::istringstream fields(line);
        std::string cell;
        while (std::getline(fields, cell, ','))
            row.push_back(atof(cell.c_str()));

        // Need at least one feature plus the label column.
        if (row.size() < 2)
            continue;
        if (rows.empty())
            width = row.size();
        else if (row.size() != width)
        {
            std::cerr << "skip line with unexpected column count" << std::endl;
            continue;
        }

        labels.push_back(row.back() == -1 ? 0 : 1);
        rows.push_back(row);
    }

    if (rows.empty())
    {
        std::cout << "no data" << std::endl;
        return 1;
    }

    const int m = (int)rows.size();
    const int n = (int)width - 1;

    // kfold() takes raw row pointers; the vectors keep ownership of the memory.
    std::vector<double*> x(m);
    for (int i = 0; i < m; i++)
        x[i] = &rows[i][0];

    double acc = kfold(&x[0], &labels[0], n, m, 5);
    std::cout << "平均:" << acc << std::endl;
    return 0;
}