# 决策树

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
import zipfile
#压缩节省空间
z=zipfile.ZipFile('ad-dataset.zip')
# df=pd.read_csv(z.open(z.namelist()[0]),header=None,low_memory=False)
# df = pd.read_csv(z.open(z.namelist()[0]), header=None, low_memory=False)
df=pd.read_csv('.\\tree_data\\ad.data',header=None)
explanatory_variable_columns=set(df.columns.values)
response_variable_column=df[len(df.columns.values)-1]
#最后一列是代表的标签类型
explanatory_variable_columns.remove(len(df.columns)-1)
y=[1 if e =='ad.' else 0 for e in response_variable_column]
X=df.loc[:,list(explanatory_variable_columns)]
#匹配?字符,并把值转化为-1
X.replace(to_replace=' *\?', value=-1, regex=True, inplace=True)
X_train,X_test,y_train,y_test=train_test_split(X,y)
#用信息增益启发式算法建立决策树
pipeline=Pipeline([('clf',DecisionTreeClassifier(criterion='entropy'))])
parameters = {
'clf__max_depth': (150, 155, 160),
'clf__min_samples_split': (1, 2, 3),
'clf__min_samples_leaf': (1, 2, 3)
}
#f1查全率和查准率的调和平均
grid_search=GridSearchCV(pipeline,parameters,n_jobs=-1,
verbose=1,scoring='f1')
grid_search.fit(X_train,y_train)
print '最佳效果:%0.3f'%grid_search.best_score_
print '最优参数'
best_parameters=grid_search.best_estimator_.get_params()
best_parameters

输出结果:

Fitting 3 folds for each of 27 candidates, totalling 81 fits
 
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 81 out of 81 | elapsed: 34.7s finished
 
最佳效果:0.888
最优参数
Out[123]:
{'clf': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=160,
max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
min_samples_split=3, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best'),
'clf__class_weight': None,
'clf__criterion': 'entropy',
'clf__max_depth': 160,
'clf__max_features': None,
'clf__max_leaf_nodes': None,
'clf__min_samples_leaf': 1,
'clf__min_samples_split': 3,
'clf__min_weight_fraction_leaf': 0.0,
'clf__presort': False,
'clf__random_state': None,
'clf__splitter': 'best',
'steps': [('clf',
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=160,
max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
min_samples_split=3, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best'))]}
for param_name in sorted(parameters.keys()):
print ('\t%s:%r'%(param_name,best_parameters[param_name]))
predictions=grid_search.predict(X_test)
print classification_report(y_test,predictions)

输出结果:

clf__max_depth:150
clf__min_samples_leaf:1
clf__min_samples_split:1
             precision    recall  f1-score   support

0       0.97      0.99      0.98       703
          1       0.91      0.84      0.87       117

avg / total       0.96      0.96      0.96       820

df.head()

输出结果;

  0 1 2 3 4 5 6 7 8 9 ... 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
0 125 125 1.0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
1 57 468 8.2105 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
2 33 230 6.9696 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
3 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
4 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.

 # 决策树集成

#coding:utf-8
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV df=pd.read_csv('.\\tree_data\\ad.data',header=None,low_memory=False)
explanatory_variable_columns=set(df.columns.values)
response_variable_column=df[len(df.columns.values)-1]
df.head()
  0 1 2 3 4 5 6 7 8 9 ... 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
0 125 125 1.0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
1 57 468 8.2105 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
2 33 230 6.9696 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
3 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
4 60 468 7.8 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 ad.
#The last column describes the targets(去掉最后一列)
explanatory_variable_columns.remove(len(df.columns.values)-1)
y=[1 if e=='ad.' else 0 for e in response_variable_column]
X=df.loc[:,list(explanatory_variable_columns)]
#置换有?的为-1
X.replace(to_replace=' *\?', value=-1, regex=True, inplace=True)
X_train,X_test,y_train,y_test=train_test_split(X,y)
pipeline=Pipeline([('clf',RandomForestClassifier(criterion='entropy'))])
parameters = {
'clf__n_estimators': (5, 10, 20, 50),
'clf__max_depth': (50, 150, 250),
'clf__min_samples_split': (1, 2, 3),
'clf__min_samples_leaf': (1, 2, 3)
}
grid_search = GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=1,scoring='f1')
grid_search.fit(X_train,y_train)
print(u'最佳效果:%0.3f'%grid_search.best_score_)
print u'最优的参数:'
best_parameters=grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print('\t%s:%r'%(param_name,best_parameters[param_name]))

输出结果:

最佳效果:0.929 最优的参数: clf__max_depth:250 clf__min_samples_leaf:1 clf__min_samples_split:3 clf__n_estimators:50
predictions=grid_search.predict(X_test)
print classification_report(y_test,predictions)

输出结果:

precision    recall  f1-score   support

0       0.98      1.00      0.99       705
          1       0.97      0.90      0.93       115

avg / total       0.98      0.98      0.98       820

最新文章

  1. Big Data
  2. 对数据库触发器new和old的理解
  3. 23.C#Queryable的扩展方法(十二章12.1-12.2)
  4. Headmaster's Headache
  5. IMPLEMENTED IN PYTHON +1 | CART生成树
  6. MFC中修改默认启动对话框方法
  7. Python 函数相关概念
  8. java并发编程的艺术——第五章总结(Lock锁与队列同步器)
  9. Swift tableview自带的刷新控件
  10. k8s踩坑记 - kubeadm join 之 token 失效
  11. 【一天一道LeetCode】#31. Next Permutation
  12. erlang在redhat上的安装
  13. Qt(MinGW版)在win7 64位上无法播放视频解决方案
  14. 为什么选择 Visual Studio Code
  15. Error occurred during initialization of VM Could not reserve enough space for 2097152KB object heap
  16. java+jenkins+testng+selenium+ant
  17. Ubuntu 12.04上安装HBase并运行
  18. linux操作命令 开发人员需要掌握的一些命令
  19. mybatis学习 十二 多表查询
  20. 并发编程之 SynchronousQueue 核心源码分析

热门文章

  1. unbind() 移除事件内处理方法
  2. POST请求的forHTTPHeaderField
  3. Windows:删除图标缓存
  4. 占位符 %s
  5. [微信小程序直播平台开发]___(三)Nginx-rtmp事件回调
  6. log4j.properties的配置详解
  7. iOS网络缓存的系统实现是一个烂尾工程
  8. 学习Android之SimpleAdapter显示网络图片
  9. 3226. [SDOI2008]校门外的区间【线段树】
  10. 【转】 java中Class对象详解和类名.class, class.forName(), getClass()区别