协同过滤源码路径:

~/project/javaproject/mahout-0.9/core/src $tree main/java/org/apache/mahout/cf/taste/ -L 2

main/java/org/apache/mahout/cf/taste/

├── common

│   ├── NoSuchItemException.java

│   ├── NoSuchUserException.java

│   ├── Refreshable.java

│   ├── TasteException.java

│   └── Weighting.java

├── eval

│   ├── DataModelBuilder.java

│   ├── IRStatistics.java

│   ├── RecommenderBuilder.java

│   ├── RecommenderEvaluator.java

│   ├── RecommenderIRStatsEvaluator.java

│   └── RelevantItemsDataSplitter.java

├── hadoop

│   ├── EntityEntityWritable.java

│   ├── EntityPrefWritable.java

│   ├── MutableRecommendedItem.java

│   ├── RecommendedItemsWritable.java

│   ├── TasteHadoopUtils.java

│   ├── ToEntityPrefsMapper.java

│   ├── ToItemPrefsMapper.java

│   ├── TopItemsQueue.java

│   ├── als

│   ├── item

│   ├── preparation

│   └── similarity

├── impl

│   ├── common

│   ├── eval

│   ├── model

│   ├── neighborhood

│   ├── recommender

│   └── similarity

├── model

│   ├── DataModel.java

│   ├── IDMigrator.java

│   ├── JDBCDataModel.java

│   ├── Preference.java

│   ├── PreferenceArray.java

│   └── UpdatableIDMigrator.java

├── neighborhood

│   └── UserNeighborhood.java

├── recommender

│   ├── CandidateItemsStrategy.java

│   ├── IDRescorer.java

│   ├── ItemBasedRecommender.java

│   ├── MostSimilarItemsCandidateItemsStrategy.java

│   ├── RecommendedItem.java

│   ├── Recommender.java

│   ├── Rescorer.java

│   └── UserBasedRecommender.java

└── similarity

    ├── ItemSimilarity.java

    ├── PreferenceInferrer.java

    ├── UserSimilarity.java

    └── precompute

similarity  相似度的interface定义

recommender 推荐算法的interface定义

model  数据model类型的interface定义

impl 目录 则是以上interface定义的实现

PearsonCorrelationSimilarity的实现在

~/mahout-core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java

/**
 * Creates a Pearson correlation similarity backed by the given data model.
 *
 * <p>The third argument handed to the superclass constructor is the "center data" flag, and it is
 * always {@code true} here. Pearson correlation amounts to subtracting each vector's mean from it
 * and then taking the cosine of the two centered vectors. In the AbstractSimilarity code quoted by
 * this walkthrough, enabling that flag adjusts the running sums before {@code computeResult} is
 * invoked:
 *
 * <pre>
 *   meanX         = sumX / count;
 *   meanY         = sumY / count;
 *   centeredSumXY = sumXY - meanY * sumX;
 *   centeredSumX2 = sumX2 - meanX * sumX;
 *   centeredSumY2 = sumY2 - meanY * sumY;
 *   result        = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
 * </pre>
 *
 * <p>When the flag is off, the raw sums are passed to {@code computeResult} unchanged.
 *
 * @param dataModel source of preference data; must report that it has preference values
 * @param weighting forwarded unchanged to the superclass constructor
 * @throws IllegalArgumentException if {@link DataModel} does not have preference values
 * @throws TasteException declared on the signature; presumably raised by the superclass
 *     constructor or the data model (TODO confirm)
 */
public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
  super(dataModel, weighting, true);
  boolean modelHasValues = dataModel.hasPreferenceValues();
  Preconditions.checkArgument(modelHasValues, "DataModel doesn't have preference values");
}

/**
 * Converts the accumulated sums into the similarity value {@code sumXY / (sqrt(sumX2) * sqrt(sumY2))}.
 *
 * <p>The plain sums of X and Y are not parameters: the inputs are assumed to be centered already,
 * so those sums are taken to be 0.
 *
 * @param n number of value pairs that were accumulated
 * @param sumXY sum of the products x * y
 * @param sumX2 sum of the squares of x
 * @param sumY2 sum of the squares of y
 * @param sumXYdiff2 sum of the squared differences (x - y); not used by this implementation
 * @return the quotient described above, or {@link Double#NaN} when {@code n} is 0 or the
 *     denominator is zero
 */
@Override
double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
  final double normProduct = Math.sqrt(sumX2) * Math.sqrt(sumY2);
  // A zero product means one or both parties gave -all- identical ratings, so nothing useful
  // can be said under this measure; an empty sample is equally undefined.
  final boolean undefined = n == 0 || normProduct == 0.0;
  return undefined ? Double.NaN : sumXY / normProduct;
}

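就是下面这个数学公式的实现(传入的各项累加和已经做过中心化,即减去了各自的均值):

$$r = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_i (x_i - \bar{x})^2}\,\sqrt{\sum_i (y_i - \bar{y})^2}}$$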

具体的累加,在父类 AbstractSimilarity 的 userSimilarity 方法里已经做了:

/**
 * Computes the similarity between two users from their preference arrays.
 *
 * <p>Both arrays are walked in parallel by item ID, always advancing the side with the smaller
 * ID (this presumes each array is ordered by ascending item ID -- TODO confirm against
 * PreferenceArray). For every item that contributes, the running sums of x, y, x*x, y*y, x*y and
 * (x-y)^2 are updated; they are then optionally centered and handed to {@code computeResult}.
 *
 * <p>Without an inferrer only items present in both arrays contribute. With an inferrer, an item
 * present on one side only also contributes, using {@code inferrer.inferPreference} for the
 * missing value.
 *
 * @param userID1 first user (the "X" side)
 * @param userID2 second user (the "Y" side)
 * @return the value of {@code computeResult}, passed through {@code normalizeWeightResult} unless
 *     it is NaN; {@link Double#NaN} immediately if either user has no preferences
 * @throws TasteException declared on the signature; presumably propagated from the data model or
 *     the inferrer (TODO confirm)
 */
@Override
public double userSimilarity(long userID1, long userID2) throws TasteException {
  DataModel model = getDataModel();
  // Fetch each user's preferences.
  PreferenceArray prefsX = model.getPreferencesFromUser(userID1);
  PreferenceArray prefsY = model.getPreferencesFromUser(userID2);
  int sizeX = prefsX.length();
  int sizeY = prefsY.length();
  if (sizeX == 0 || sizeY == 0) {
    return Double.NaN;
  }

  // Cursor state: current position in each array and the item ID found there.
  long itemX = prefsX.getItemID(0);
  long itemY = prefsY.getItemID(0);
  int posX = 0;
  int posY = 0;

  double sumX = 0.0;
  double sumX2 = 0.0;
  double sumY = 0.0;
  double sumY2 = 0.0;
  double sumXY = 0.0;
  double sumXYdiff2 = 0.0;
  int count = 0;

  boolean inferring = inferrer != null;

  for (;;) {
    // -1: X's current item ID is smaller; 1: Y's is smaller; 0: same item.
    int order;
    if (itemX < itemY) {
      order = -1;
    } else if (itemX > itemY) {
      order = 1;
    } else {
      order = 0;
    }

    if (inferring || order == 0) {
      double x;
      double y;
      if (order == 0) {
        // Both users expressed a preference for the item.
        x = prefsX.getValue(posX);
        y = prefsY.getValue(posY);
      } else if (order < 0) {
        // No matching rating on the other side: infer it and tally as if it had been expressed.
        // Here X has a value; infer Y's.
        x = prefsX.getValue(posX);
        y = inferrer.inferPreference(userID2, itemX);
      } else {
        // Y has a value; infer X's.
        x = inferrer.inferPreference(userID1, itemY);
        y = prefsY.getValue(posY);
      }
      sumXY += x * y;
      sumX += x;
      sumX2 += x * x;
      sumY += y;
      sumY2 += y * y;
      double diff = x - y;
      sumXYdiff2 += diff * diff;
      count++;
    }

    if (order <= 0) {
      posX++;
      if (posX < sizeX) {
        itemX = prefsX.getItemID(posX);
      } else if (!inferring || itemY == Long.MAX_VALUE) {
        // X is exhausted; stop unless remaining Ys still need inferred partners.
        break;
      } else {
        // Must count other Ys; pretend next X is far away.
        itemX = Long.MAX_VALUE;
      }
    }

    // This check must run after the X cursor update above: it reads the updated itemX.
    if (order >= 0) {
      posY++;
      if (posY < sizeY) {
        itemY = prefsY.getItemID(posY);
      } else if (!inferring || itemX == Long.MAX_VALUE) {
        // Y is exhausted; stop unless remaining Xs still need inferred partners.
        break;
      } else {
        // Must count other Xs; pretend next Y is far away.
        itemY = Long.MAX_VALUE;
      }
    }
  }

  final double raw;
  if (centerData) {
    // "Center" the data. The full expansions would be
    //   sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY
    //   sumX2 - 2.0 * meanX * sumX + n * meanX * meanX
    //   sumY2 - 2.0 * meanY * sumY + n * meanY * meanY
    // and the shorter forms below are what the code actually uses.
    double meanX = sumX / count;
    double meanY = sumY / count;
    double centeredSumXY = sumXY - meanY * sumX;
    double centeredSumX2 = sumX2 - meanX * sumX;
    double centeredSumY2 = sumY2 - meanY * sumY;
    raw = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
  } else {
    raw = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
  }

  // NaN is returned as-is; every other value goes through normalizeWeightResult.
  return Double.isNaN(raw) ? raw : normalizeWeightResult(raw, count, cachedNumItems);
}

参考:

http://blog.csdn.net/v_july_v/article/details/7184318

http://blog.sina.com.cn/s/blog_73de143c010153vp.html

最新文章

  1. 响应式web网站设计制作方法
  2. maven 环境的配置 JAVA_HOME not found in your envirnment
  3. Mac OS 下的解压缩软件——The Unarchiver
  4. s3c2440 移值u-boot-2016.03 第3篇 支持Nor flash 识别
  5. cocos2d-x 3.6版连连看载入资源
  6. golang高级部分
  7. 使用oracle数据库,多用户同时对一个表进行增加,删除,修改,查看等操作,会不会有影响?
  8. openCV(四)---Canny边缘检测
  9. Doctype 文档类型,标准模式,混杂模式
  10. Django 分类标签查找
  11. [Codeforces 448C]Painting Fence
  12. 常用的几条sql语句
  13. node.js 调试问题
  14. MySQL的安装与维护
  15. Jquery 清除空白字符
  16. Keystone API
  17. fbx模型动画提取教程附带一个用代码提取的方法
  18. 20155334 实验三 敏捷开发与XP实践
  19. 前端神器!!gulp livereload实现浏览器自动刷新
  20. Laravel 控制器 Controller 传值到 视图 View 的几种方法总结

热门文章

  1. 浅析StackTrace
  2. [usb]usb otg和host
  3. LT和ET模式
  4. 关于js中__proto__和prototype的一些理解<转>
  5. signal基础
  6. RabbitMQ之任务队列【译】
  7. TensorFlow基础笔记(0) 参考资源学习文档
  8. php -- 用文本来存储内容,file_put_contents,serialize,unserialize
  9. 【BZOJ】1622: [Usaco2008 Open]Word Power 名字的能量(dp/-模拟)
  10. mysql插入多行数据