http://blog.csdn.net/xceman1997/article/details/7955349

http://www.cnblogs.com/yuyang-DataAnalysis/archive/2012/01/31/2333760.html

http://zhan.renren.com/dmeryuyang?gid=3602888497999161050&checked=true

http://blog.csdn.net/yanqingan/article/details/6125812

bool NaiveBayes::Train (const char * sFileSample, int iClassNum, int iFeaTypeNum,
string & sSegmenter, int iFeaExtractNum, const char * sFileModel, bool bCompactModel)
{
// 防御性代码
if (iClassNum <= 0 || iFeaTypeNum <= 0 || iFeaExtractNum <= 0)
return false;

ifstream in (sFileSample, ios_base::binary);
ofstream out (sFileModel);
if (!in || !out)
{
cerr << "Can not open the file" << endl;
return false;
}

// 这些都是临时数据结构,用来临时存储模型参数,特征选择需要的参数等等
// 1. the temp data structure for model parameters
// 1.1 the total number of document in training samples
int iTotalDocNum = 0;
// 1.2 the prior probability of class, temparaly it store the doc number in this class
double * pClassPriorProb = new double [iClassNum];
memset (pClassPriorProb, 0, iClassNum*sizeof(double));
// 1.3 the prior probability of feature type, temparaly it stores the doc number in this feature (这个主要用于特征选择,bayes模型本身并不需要这个参数)
double * pFeaItemPriorProb = new double [iFeaTypeNum];
memset (pFeaItemPriorProb, 0, iFeaTypeNum*sizeof(double));
// 1.4 the chi-square value that feature falls into class, temparaly it stores the doc number for this class and feature (可以看到,特征选择算法主要用卡方选择)
double ** ppChiMatrix = new double * [iClassNum];
for (int i=0; i<iClassNum; i++)
{
ppChiMatrix[i] = new double [iFeaTypeNum];
memset (ppChiMatrix[i], 0, iFeaTypeNum*sizeof(double));
}
// 1.5 the post-probability for class and feature
double ** ppPProbMatrix = new double * [iClassNum];
for (int i=0; i<iClassNum; i++)
{
ppPProbMatrix[i] = new double [iFeaTypeNum];
memset (ppChiMatrix[i], 0, iFeaTypeNum*sizeof(double));
}
// 1.6 for the feature selection (表示哪些特征被选中了)
int * pFeaSelected = new int [iFeaTypeNum];
memset (pFeaSelected, 0, iFeaTypeNum*sizeof(int));

// 2. iterate the training samples and fill count into the temp data structure
string sLine;
int i = 0;
while (getline (in, sLine))
{
// show some information on screen
if (0 == i%10000)
cout << i << "\n";
i++;

// 2.1 the total number of doc
iTotalDocNum++;

// 2.2 split the sample into class and feature items
string::size_type iSeg = sLine.find_first_of (sSegmenter);
string sTmp = sLine.substr (0, iSeg);
int iClassId = atoi (sTmp.c_str());
if (iClassId >= iClassNum)
continue;
pClassPriorProb [iClassId]++;

// 2.3 count the rest feature items
iSeg += sTmp.length();
sTmp = sLine.substr (iSeg);
istringstream isLine (sTmp);
string sTmpItem;
while (isLine >> sTmpItem)
{
int iFeaItemId = atoi (sTmpItem.c_str());
if (iFeaItemId >= iFeaTypeNum)
continue;
// add the count
pFeaItemPriorProb [iFeaItemId]++;
ppChiMatrix [iClassId][iFeaItemId]++;

}
}

// 3. calculate the model parameters
// 3.1 the chi-square value as well as the post-probabilty
for (int i=0; i<iClassNum; i++)
{
for (int j=0; j<iFeaTypeNum; j++)
{
double dA = ppChiMatrix[i][j];
double dB = pFeaItemPriorProb[j] - dA; // currently pFeaItemPriorProb[i] == sum_i (ppChiMatrix[i][j])
double dC = pClassPriorProb [i] - dA; // currently pClassPriorProb[i] == sum_j (ppChiMatrix[i][j])
double dD = (double)iTotalDocNum - dA - dB - dC;

// the chi value
double dNumerator = dA * dD;
dNumerator -= dB * dC;
dNumerator = pow (dNumerator, 2.0);
double dDenominator = dA + dB;
dDenominator *= (dC + dD);
dDenominator += DBL_MIN; // for smoothing
ppChiMatrix[i][j] = dNumerator / dDenominator;

// the post-probability: p(feature|class)
ppPProbMatrix[i][j] = dA / pClassPriorProb [i];
}
}

// 3.2 the prior probability of class
for (int i=0; i<iClassNum; i++)
pClassPriorProb [i] /= iTotalDocNum;

// 3.3 the prior probability of feature
for (int i=0; i<iFeaTypeNum; i++)
pFeaItemPriorProb [i] /= iTotalDocNum;

// 4. feature selection (这个函数下一篇文章再详细讲)
FeaSelByChiSquare (ppChiMatrix, ppPProbMatrix, iClassNum,
iFeaTypeNum, iFeaExtractNum, pFeaSelected);

// 5. dump the model into txt file

if (bCompactModel) // output the parameters only for predicting
{
// 5.1 the prior probability of class
out << iClassNum << endl;
for (int i=0; i<iClassNum; i++)
{
out << pClassPriorProb [i] << "\n";
}
// 5.2 the actual selected feature type number
int iActualFeaNum = 0;
for (int j=0; j<iFeaTypeNum; j++)
{
if (1 == pFeaSelected[j])
iActualFeaNum ++;
}
out << iActualFeaNum << endl;
// 5.3 the post probability
for (int i=0; i<iClassNum; i++)
{
for (int j=0; j<iFeaTypeNum; j++)
{
if (1 == pFeaSelected[j])
{
out << j << ":" << ppPProbMatrix[i][j] << "\n";
}
}
}
}
else // output the full information
{
// 5.1 the total number of document
out << iTotalDocNum << endl;

// 5.2 the prior probability of class
out << iClassNum << endl;
for (int i=0; i<iClassNum; i++) // classindex:priorprob
{
out << i << ":" << pClassPriorProb [i] << "\n";
}

// 5.3 the prior probability of feature type: this is NO used in bayes model, record this for more info
// and whether this feature is selected or not by any class
out << iFeaTypeNum << "\n";
for (int i=0; i<iFeaTypeNum; i++) // featureId:priorprob:selected or not
{
out << i << ":" << pFeaItemPriorProb[i] << ":" << pFeaSelected << "\n";
}

// 5.4 the chi-square value for class-feature pair
for (int i=0; i<iClassNum; i++)
{
for (int j=0; j<iFeaTypeNum; j++)
{
out << ppChiMatrix[i][j] << "\n";
}
}

// 5.5 the post probability
for (int i=0; i<iClassNum; i++)
{
for (int j=0; j<iFeaTypeNum; j++)
{
out << ppPProbMatrix[i][j] << "\n";
}
}
}

// last, release the memory
delete [] pClassPriorProb;
delete [] pFeaItemPriorProb;
for (int i=0; i<iClassNum; i++)
{
delete [] ppChiMatrix[i];
}
delete [] ppChiMatrix;
for (int i=0; i<iClassNum; i++)
{
delete [] ppPProbMatrix[i];
}
delete [] ppPProbMatrix;
delete [] pFeaSelected;

return true;
}

最新文章

  1. kafka 安装出现的几个问题
  2. 在laravel下關於blade模板的嘗試
  3. struct和typedef struct用法
  4. Telerik XML 数据源绑定的问题
  5. iOS验证码倒计时(GCD实现)
  6. iOS开发——高级技术&生成二维码
  7. git的使用方法总结
  8. VMware上实现LVS负载均衡(NAT)
  9. NODE.JS安装配置
  10. oracle error info
  11. C#中byte[] 与指针
  12. 新概念英语(1-119)who call out to the thieves in the dark?
  13. WEB通知和React Native之即时通讯(iOS Android)
  14. sjms-3 结构型模式
  15. 「UVA10766」Organising the Organisation(生成树计数)
  16. Linux常用命令——文件搜索命令
  17. 版本管理工具Git(3)VS2013下如何使用git
  18. 七牛云java(服务端)通用工具类
  19. orcl 中upper()和lower()和initcap()的用法
  20. ubuntu and centos各种上网代理设置

热门文章

  1. Sunisoft.IrisSkin.SkinEngine 设置winform皮肤
  2. 关于 Chrome Console 查看DOM详情细节的奇思淫巧
  3. 【JEECG_3.7.1】列表多表头的设计
  4. Excel导入的时候日期格式会变成double式的String数据处理
  5. Linux操作系统CentOS7.2发行版本的安装与配置(安装是选择服务器类型)
  6. lftp查看文件时间与登录服务查看文件时间相差8小时
  7. Cocos2d-x Scene生命周期 pushScene和replaceScene
  8. 第九章 搭建Hadoop 2.2.0版本HDFS的HA配置
  9. vue使用sweetalert2弹窗插件
  10. poj 50道dp题