Parallel Decision Tree
2024-10-16 22:47:16
Decision Tree such as C4.5 is easy to parallel. Following is an example.
This is a non-parallel version:
public void learnFromDataSet(Iterable<Sample<FK, FV, Boolean>> dataset){
for(Sample sample : dataset){
model.addSample((MapBasedBinarySample<FK, FV>)sample);
}
Queue<TreeNode<FK, FV>> Q = new LinkedList<TreeNode<FK, FV>>();
TreeNode<FK, FV> root = model.selectRootTreeNode();
model.addTreeNode(root);
Q.add(root);
while (!Q.isEmpty()){
TreeNode v = Q.poll();
if(v.getDepth() >= model.getMaxDepth()){
continue;
}
FeatureSplit<FK> featureSplit = model.selectFeature(v);
if(featureSplit.getFeatureId() == null){
continue;
}
v.setFeatureSplit(featureSplit);
Pair<TreeNode<FK,FV>, TreeNode<FK, FV>> children =
model.newTreeNode(v, featureSplit);
TreeNode leftNode = children.getKey();
TreeNode rightNode = children.getValue();
if(leftNode != null
&& leftNode.getSampleSize() > model.getMinSampleSizeInNode()){
v.setLeft(leftNode);
model.addTreeNode(leftNode);
Q.add(leftNode);
}
if(rightNode != null
&& rightNode.getSampleSize() > model.getMinSampleSizeInNode()){
v.setRight(rightNode);
model.addTreeNode(rightNode);
Q.add(rightNode);
}
}
}
And this is a parallel version:
public class NodeSplitThread implements Runnable{
private TreeNode<FK, FV> node = null;
private Queue<TreeNode<FK, FV>> Q = null; public NodeSplitThread(TreeNode<FK, FV> node, Queue<TreeNode<FK, FV>> Q){
this.node = node;
this.Q = Q;
}
@Override
public void run() {
if(node.getDepth() >= model.getMaxDepth()){
return;
}
FeatureSplit<FK> featureSplit = model.selectFeature(node);
if(featureSplit.getFeatureId() == null){
return;
}
node.setFeatureSplit(featureSplit);
Pair<TreeNode<FK,FV>, TreeNode<FK, FV>> children = model.newTreeNode(node, featureSplit);
TreeNode<FK, FV> leftNode = children.getKey();
TreeNode<FK, FV> rightNode = children.getValue(); if(leftNode != null && leftNode.getSampleSize() > model.getMinSampleSizeInNode()){
node.setLeft(leftNode);
model.addTreeNode(leftNode);
Q.add(leftNode);
}
if(rightNode != null && rightNode.getSampleSize() > model.getMinSampleSizeInNode()){
node.setRight(rightNode);
model.addTreeNode(rightNode);
Q.add(rightNode);
}
}
} public List<TreeNode<FK, FV>> pollTopN(Queue<TreeNode<FK, FV>> Q, int n){
List<TreeNode<FK, FV>> ret = new ArrayList<TreeNode<FK, FV>>();
for(int i = 0; i < n; ++i){
if(Q.isEmpty()) break;
TreeNode<FK, FV> node = Q.poll();
ret.add(node);
}
return ret;
} @Override
public void learnFromDataSet(Iterable<Sample<FK, FV, Boolean>> dataset){ for(Sample sample : dataset){
model.addSample((MapBasedBinarySample<FK, FV>)sample);
}
Queue<TreeNode<FK, FV>> Q = new ConcurrentLinkedQueue<TreeNode<FK, FV>>();
TreeNode<FK, FV> root = model.selectRootTreeNode();
model.addTreeNode(root);
Q.add(root);
ExecutorService threadPool = Executors.newFixedThreadPool(10);
while (!Q.isEmpty()){
List<TreeNode<FK, FV>> nodes = pollTopN(Q, 10);
List<Future> tasks = new ArrayList<Future>(nodes.size());
for(TreeNode<FK, FV> node : nodes){
Future task = threadPool.submit(new NodeSplitThread(node, Q));
tasks.add(task);
}
for(Future task : tasks){
try {
task.get();
} catch (InterruptedException e) {
continue;
} catch (ExecutionException e) {
continue;
}
}
}
threadPool.shutdown();
try {
threadPool.awaitTermination(60, TimeUnit.SECONDS);
} catch (InterruptedException e) {
threadPool.shutdownNow();
Thread.interrupted();
}
threadPool.shutdownNow();
}
http://xlvector.net/blog/?p=896
最新文章
- 广州PostgreSQL用户会技术交流会小记 2015-9-19
- VMware Player安装Debian系统
- Delphi 中记录类型 给记录指针赋值。
- python学习之——selenium元素定位
- python中的Iterable, Iterator,生成器概念
- SSH入门简单搭建例子
- webstorm常用快捷键(常用)
- PHP定义静态方法的原则
- IE中float元素如果同时设置了margin值,此时margin的值会变为双倍的解决方法
- 2.如何使用matlab拟合曲线
- CentOS 6.5 中安装 Mysql 5.6,并远程连接Mysql
- bugku web web3
- python集合(set)的运算
- SSH(Spring+Struts2+Hibernate) of mappings(SSH三大框架的映射问题)
- nodeJs的Buffer操作
- ubuntu16.04+caffe+GPU+cuda+cudnn安装教程
- python pandas使用数据透视表
- Vue + Element UI 实现权限管理系统(更换皮肤主题)
- Codeforces805 A. Fake NP 2017-05-05 08:30 327人阅读 评论(0) 收藏
- 浅谈NodeJs的模块机制