【机器学习】C++手撸一个逻辑回归
flag数组:flag[i]为true表示第i个样本属于训练集,为false表示第i个样本属于测试集。
J函数:logistic回归的代价函数,采用对数代价(交叉熵),并带有L2正则项。
logistic函数:x特征,y标签,n特征数,m1样本起始点,m2样本结束点,a学习率,c正则化参数,返回长度为n+1的数组,即训练好的参数。
kfold函数:x特征,y标签,n特征数,m样本数,k折数,返回k折平均acc。
测试数据: 点击下载
数据说明:第10列为标签,-1与1,前九列为特征。
代码运行结果:(5折交叉验证)
代码:
#include <iostream>
#include<vector>
#include<queue>
#include<algorithm>
#include<string>
#include<vector>
#include<stack>
#include<cmath>
#include<set>
#include<string.h>
#include<fstream>
#include<sstream>
#include<ctime>
using namespace std;
// Per-sample split mask shared by J, logistic and kfold:
//   flag[i] == true  -> sample i belongs to the training set,
//   flag[i] == false -> sample i is held out and skipped by J / logistic.
bool flag[10000];

/**
 * Regularized logistic-regression cost: mean log-loss plus an L2 penalty.
 *
 * Only samples i in [m1, m2) with flag[i] == true contribute.
 *
 * @param s  parameter vector of length n + 1; s[0] is the bias and s[j]
 *           multiplies feature x[i][j - 1]
 * @param x  feature rows; x[i][0 .. n-1] are read
 * @param y  labels; y[i] == 1 is the positive class, any other value is
 *           treated as the negative class
 * @param m1 first sample index (inclusive)
 * @param m2 last sample index (exclusive)
 * @param n  number of features
 * @param c  L2 regularization weight (the bias s[0] is not penalized)
 * @return   mean log-loss over the selected samples plus
 *           c * sum(s[j]^2, j = 1..n) / (2 * count); 0 when no sample in
 *           the range is selected
 */
double J(double *s, double **x, int *y, int m1, int m2, int n, double c)
{
    double lossSum = 0;
    int used = 0;
    for (int i = m1; i < m2; i++)
    {
        if (!flag[i])
            continue;
        used++;

        // Linear score t = theta . x, so that h = 1 / (1 + e^-t).
        double t = s[0];
        for (int j = 1; j <= n; j++)
            t += s[j] * x[i][j - 1];

        // -log(h) = log(1 + e^-t) and -log(1 - h) = log(1 + e^t).
        // Evaluating log(1 + e^u) as max(u, 0) + log(1 + e^-|u|) never feeds
        // 0 to log(), so a saturated sigmoid yields a large finite cost
        // instead of inf (which would turn the caller's |last - now| into NaN).
        const double u = (y[i] == 1) ? -t : t;
        const double absU = (u < 0) ? -u : u;
        lossSum += (u > 0 ? u : 0.0) + std::log(1.0 + std::exp(-absU));
    }

    // No selected sample: avoid 0 / 0.
    if (used == 0)
        return 0;

    double penalty = 0;
    for (int j = 1; j <= n; j++)
        penalty += s[j] * s[j];
    penalty *= c;
    penalty /= 2 * used;

    return lossSum / used + penalty;
}
double* logistic(double **x, int *y, int n,int m1, int m2,double a,double c)
{
double *s = new double[n + 1];//θ
srand(time(0));
for (int i = 0; i < n + 1; i++)
s[i] = rand()%100/(double)100;//随机初始化
cout << s[0] << endl;
double last = 0;
double now = J(s, x, y, m1,m2, n, c);
double *temp = new double[n + 1];
while (abs(last - now) > 0.00001)
{
//cout << now << endl;
for (int i = 0; i <= n; i++)
{
double mqiuhe = 0;
int fa = 0;
for (int k = m1; k < m2; k++)
{
if (!flag[k])
{
fa++;
continue;
}
double z = -s[0];
for (int j = 1; j <= n; j++)
z -= s[j] * x[k][j-1];
double h = (double)1 / (1 + exp(z));
h -= y[k];
if(i!=0)
h *= x[k][i-1];
mqiuhe += h;
}
mqiuhe /= m2-m1-fa;
mqiuhe *= a;
double C = 0;
if(i!=0)
C = c / (m2 - m1 - fa)*s[i],C *= a;
temp[i] = s[i] - mqiuhe - C;
}
for (int i = 0; i <= n; i++)
s[i] = temp[i];
last = now;
now = J(s, x, y, m1,m2, n, c);
}
return s;
}
/**
 * k-fold cross-validation of the logistic-regression trainer.
 *
 * The m samples are split, in their given order, into k contiguous folds
 * whose sizes differ by at most one, so every sample is tested exactly once.
 * For each fold the global flag mask marks that fold as held out, a model is
 * trained on the rest with logistic() (learning rate 0.1, regularization
 * weight 0.01), and the accuracy on the held-out fold is printed.
 *
 * @param x feature rows; x[i][0 .. n-1] are read
 * @param y labels (0 or 1)
 * @param n number of features
 * @param m number of samples
 * @param k number of folds
 * @return  mean of the k per-fold accuracies; 0 (after an error message on
 *          stderr) when k <= 0, m < k, or m exceeds the capacity of flag
 */
double kfold(double **x, int *y, int n, int m, int k)
{
    const int capacity = static_cast<int>(sizeof(flag) / sizeof(flag[0]));
    if (k <= 0 || m < k || m > capacity)
    {
        std::cerr << "kfold: requires 0 < k <= m <= " << capacity << std::endl;
        return 0;
    }

    double avg = 0;
    for (int fold = 0; fold < k; fold++)
    {
        // Held-out range [begin, end). Computing the bounds from the totals
        // spreads the m % k leftover samples across the folds instead of
        // silently excluding them from testing.
        const int begin = static_cast<int>(static_cast<long long>(fold) * m / k);
        const int end = static_cast<int>(static_cast<long long>(fold + 1) * m / k);

        for (int i = 0; i < m; i++)
            flag[i] = (i < begin || i >= end); // true = training sample

        double *s = logistic(x, y, n, 0, m, 0.1, 0.01); // learning rate 0.1, regularization weight 0.01

        int tr = 0; // correctly classified held-out samples
        for (int i = begin; i < end; i++)
        {
            double res = s[0];
            for (int f = 1; f <= n; f++)
                res += s[f] * x[i][f - 1];
            const double h = 1.0 / (1 + std::exp(-res));
            if (h >= 0.5 && y[i] == 1)
                tr++;
            else if (h < 0.5 && y[i] == 0)
                tr++;
        }
        delete[] s; // logistic() returns a new[] array owned by the caller

        const double acc = tr / static_cast<double>(end - begin);
        avg += acc;
        std::cout << "第" << fold + 1 << "次acc值为:" << acc << std::endl;
    }
    return avg / k;
}
int main() {
ifstream f;
f.open("breast-w.csv");
if (!f.is_open())
{
cout << "open error" << endl;
return 0;
}
string line;
int m = 0;
int n = 0;
double **x = new double*[10000];
int *y = new int[10000];
while (getline(f,line))
{
int n1 = 0;
x[m] = new double[1000];
istringstream sin(line);
string temp;
while (getline(sin, temp, ','))
{
x[m][n1] = atof(temp.c_str());
//cout << x[m][n1] << endl;
n1++;
}
y[m] = x[m][n1 - 1]==-1?0:1;
m++;
n = n1-1;
}
double acc = kfold(x, y, n, m, 5);
cout << "平均:" << acc << endl;
}