首先当然要配置r语言环境变量什么的

D:\R-3.5.1\bin\x64;
D:\R-3.5.1\bin\x64\R.dll;
D:\R-3.5.1;
D:\ProgramData\Anaconda3\Lib\site-packages\rpy2;

本来用 Python 也可以实现关联规则(虽然没有现成的包,要自己写算法),但是可视化挺麻烦的

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from pandas import read_csv


def loadDataSet(path="F:/goverment/Aprior/No Number.csv"):
    """Load the transaction data set from a CSV file.

    Parameters
    ----------
    path : str
        CSV file to read; defaults to the original hard-coded location so
        existing callers keep working.

    Returns
    -------
    list[list]
        One inner list per CSV row (each row is one transaction).
    """
    dataset = read_csv(path)
    # values.tolist() converts the whole frame to plain Python lists in one
    # step, replacing the original hand-rolled double append loop.
    return dataset.values.tolist()
def createC1(dataSet):
    """Build the candidate 1-itemsets from the raw transactions.

    Returns a sorted list of single-element frozensets; frozensets are
    hashable, so later stages can use them directly as dict keys for
    support counting.
    """
    # Collect distinct items with a set: the original tested
    # `[item] in C1` on a list, which is O(n) per lookup inside a
    # double loop.
    items = set()
    for transaction in dataSet:
        items.update(transaction)
    return [frozenset([item]) for item in sorted(items)]
def scanD(D, Ck, minSupport):
    """Count candidates Ck over transactions D and keep the frequent ones.

    Returns (retList, supportData): retList holds itemsets whose support
    reaches minSupport (in reverse discovery order, matching the original
    insert-at-front behaviour); supportData maps every candidate that
    appeared at least once to its support ratio.
    """
    counts = {}
    for transaction in D:
        for candidate in Ck:
            if candidate.issubset(transaction):
                counts[candidate] = counts.get(candidate, 0) + 1
    total = float(len(D))
    retList = []
    supportData = {}
    for itemset, hits in counts.items():
        ratio = hits / total
        if ratio >= minSupport:
            retList.insert(0, itemset)
        # NOTE(review): support is recorded for every counted candidate,
        # not only the frequent ones — matches the classic layout.
        supportData[itemset] = ratio
    return retList, supportData
def calSupport(D, Ck, min_support):
    """Variant of scanD used for the 1-itemset pass.

    Identical counting logic, but frequent itemsets are appended in
    discovery order instead of inserted at the front.
    """
    tally = {}
    for transaction in D:
        for candidate in Ck:
            if candidate.issubset(transaction):
                tally[candidate] = tally.get(candidate, 0) + 1
    n = float(len(D))
    supportData = {}
    relist = []
    for itemset, occurrences in tally.items():
        ratio = occurrences / n
        if ratio >= min_support:
            relist.append(itemset)
        # Support is returned for all counted candidates (the original
        # comment notes either choice is possible here).
        supportData[itemset] = ratio
    return relist, supportData
def aprioriGen(Lk, k):
    """Create candidate k-itemsets from the frequent (k-1)-itemsets Lk.

    Two itemsets are joined when their first k-2 items (sorted) agree,
    which prevents duplicate candidates; each candidate is then pruned
    unless every (k-1)-subset is itself frequent (Apriori property).

    BUG FIX: the original sliced `list(Lk[i])[:k-2]` and sorted the
    slice afterwards. Frozenset iteration order is arbitrary, so for
    k >= 3 the slice could pick arbitrary elements and valid joins could
    be missed. Sorting must happen BEFORE slicing.
    """
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            prefix_i = sorted(Lk[i])[:k - 2]
            prefix_j = sorted(Lk[j])[:k - 2]
            if prefix_i == prefix_j:
                candidate = Lk[i] | Lk[j]
                # Prune: every (k-1)-subset of the candidate must be
                # frequent, i.e. present in Lk.
                subsets = [candidate - frozenset([item]) for item in candidate]
                if all(sub in Lk for sub in subsets):
                    retList.append(candidate)
    return retList
def apriori(dataSet, minSupport=0.2):
    """Mine all frequent itemsets from dataSet.

    Returns (L, supportData) where L is a list of levels — L[0] the
    frequent 1-itemsets, L[1] the frequent 2-itemsets, and so on — and
    supportData maps each counted itemset to its support ratio.
    """
    candidates1 = createC1(dataSet)
    transactions = list(map(set, dataSet))
    level1, supportData = calSupport(transactions, candidates1, minSupport)
    L = [level1]  # wrap so the 1-itemsets form the first level
    k = 2
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(transactions, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    # The loop always appends one empty level before terminating.
    del L[-1]
    return L, supportData
def getSubset(fromList, toList):
    """Append every proper non-empty subset of fromList to toList.

    Subsets are stored as frozensets; toList is mutated in place and
    doubles as the dedup record, so already-seen subsets (and their own
    subsets, which were generated when first seen) are skipped.
    """
    for element in fromList:
        reduced = frozenset(set(fromList) - {element})
        if reduced not in toList:
            toList.append(reduced)
            remaining = list(reduced)
            if len(remaining) > 1:
                getSubset(remaining, toList)
def calcConf(freqSet, H, supportData, Rule, minConf=0.7, minLift=3):
    """Emit rules (antecedent --> consequent) derived from freqSet.

    For each candidate consequent in H, computes
        confidence = P(freqSet) / P(antecedent)
        lift       = P(freqSet) / (P(consequent) * P(antecedent))
    and, when confidence >= minConf and lift > minLift, appends a token
    list of the form [item, " ", ..., '-->', item, " ", ..., conf%, "% "]
    to Rule (mutated in place and also returned).

    GENERALIZATION: the lift threshold was hard-coded to 3; it is now the
    keyword parameter minLift with the same default, so existing callers
    are unaffected.
    """
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        lift = supportData[freqSet] / (supportData[conseq] * supportData[antecedent])
        ls = []
        if conf >= minConf and lift > minLift:
            for item in antecedent:
                ls.append(item)
                ls.append(" ")
            ls.append('-->')
            for item in conseq:
                ls.append(item)
                ls.append(" ")
            # Confidence rendered as a percentage rounded to one decimal.
            ls.append(round(conf * 100, 1))
            ls.append("% ")
        if ls:
            Rule.append(ls)
    return Rule
def gen_rule(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets in L.

    Skips the 1-itemset level (a rule needs at least two items); for each
    frequent k-itemset, enumerates all proper non-empty subsets as
    candidate consequents and delegates scoring to calcConf.
    """
    bigRuleList = []
    for level in L[1:]:
        for freqSet in level:
            consequents = []
            getSubset(list(freqSet), consequents)
            calcConf(freqSet, consequents, supportData, bigRuleList, minConf)
    return bigRuleList
if __name__ == '__main__':
    dataSet = loadDataSet()
    L, supportData = apriori(dataSet, minSupport=0.05)
    rules = gen_rule(L, supportData, minConf=0.5)
    # Print each rule's tokens on one line, skipping literal comma tokens.
    for tokens in rules:
        for tok in tokens:
            if tok == ',':
                continue
            print(tok, end="")
        print("")

# Lift = P(B|A) / P(B): a value of 1 means the two sides are unrelated,
# a value below 1 means they are negatively associated, and in data
# mining a rule is usually only considered valuable when lift exceeds 3.

之后还是用r吧,要下载rpy2,见https://www.cnblogs.com/caiyishuai/p/9520214.html

还要下载两个R的包

import rpy2.robjects as robjects

# One-off setup: install the R packages used below for association-rule
# mining (arules) and rule visualisation (arulesViz).
r_install = ('''
install.packages("arules")
install.packages("arulesViz")
''')
robjects.r(r_install)

然后就是主代码了

import rpy2.robjects as robjects

# Step 1: read the data as factor columns and mine rules with
# arules::apriori over the selected columns.
r_mine = ('''Encoding("UTF-8")
setwd("F:/goverment/Aprior")
all_data<-read.csv("F:/goverment/Aprior/NewData.csv",header = T,#将数据转化为因子型
colClasses=c("factor","factor","factor","factor","factor","factor","factor","factor","factor","factor","factor","factor"))
library(arules)
rule=apriori(data=all_data[,c(1,4,5,6,7,8,9,10,12)], parameter = list(support=0.05,confidence=0.7,minlen=2,maxlen=10))
''')
robjects.r(r_mine)

# Step 2: keep rules with lift > 1, sort by lift, and (optionally)
# compute the redundant-rule mask — the author notes it is not applied.
robjects.r('''
rule.subset<-subset(rule,lift>1)
#inspect(rule.subset)
rules.sorted<-sort(rule.subset,by="lift")
subset.matrix<-is.subset(rules.sorted,rules.sorted)
lower.tri(subset.matrix,diag=T)
subset.matrix[lower.tri(subset.matrix,diag = T)]<-NA
redundant<-colSums(subset.matrix,na.rm = T)>=1 #这五条就是去冗余(感兴趣可以去网上搜),我虽然这里写了,但我没有去冗余,我的去了以后一个规则都没了
which(redundant)
rules.pruned<-rules.sorted[!redundant]
#inspect(rules.pruned) #输出去冗余后的规则
''')

# Step 3: plot the rules with arulesViz and export the sorted rule set
# to CSV for the Python post-processing step.
r_plot_export = ('''
library(arulesViz)#掉包
jpeg(file="plot1.jpg")
#inspect(rule.subset)
plt<-plot(rule.subset,shading = "lift")#画散点图
dev.off()
subrules<-head(sort(rule.subset,by="lift"),50)
#jpeg(file="plot2.jpg")
plot(subrules,method = "graph")#画图
#dev.off()
rule.sorted <- sort(rule.subset, decreasing=TRUE, by="lift") #按提升度排序
rules.write<-as(rule.sorted,"data.frame") #将规则转化为data类型
write.csv(rules.write,"F:/goverment/Aprior/NewRules.csv",fileEncoding="UTF-8")
''')
robjects.r(r_plot_export)
from pandas import read_csv


def _format_cell(value):
    """Format one rule-table cell for display.

    Numbers in (0, 1] are rendered as percent strings with two decimals;
    other numbers are rounded to two decimals; non-numeric values pass
    through unchanged.
    """
    try:
        num = float(value)
    # BUG FIX: the original used a bare `except:`, which also swallows
    # KeyboardInterrupt/SystemExit; only conversion failures belong here.
    except (TypeError, ValueError):
        return value
    if 0 < num <= 1:
        return str(round(num * 100, 2)) + "%"
    return round(num, 2)


# Read the rules exported by R and pretty-print them row by row.
data_set = read_csv("F:/goverment/Aprior/NewRules.csv")
rul = [[_format_cell(cell) for cell in row] for row in data_set.values]
for line in rul:
    print(line)

最新文章

  1. GridView点击行触发SelectedIndexChanged事件
  2. 需要使用id内省方法--responsesToSelector: 的两个地方
  3. HDU 1565 最大点权独立集
  4. UINavgation日常小bug-有兴趣的朋友可以看看
  5. DotNet Core 之旅(一)
  6. Linux安装Team Service Agent
  7. HDU_2030——统计文本中汉字的个数
  8. struct tm->time() localtime() gmtime()
  9. mysql 命令行 自动补全
  10. 在非MFC程序中使用调试宏 ASSERT(),VERIFY()和 TRACE()
  11. MTK MOTA升级步骤
  12. Java实现二分查找算法
  13. Java之面向对象概述,类,构造方法,static,主方法,对象
  14. 常用http响应报文分析
  15. java基础部分细节
  16. Linux 进程状态 概念 Process State Definition
  17. node.js的安装的配置
  18. ZooKeeper连接并创建节点以及实现分布式锁操作节点排序输出最小节点Demo
  19. hdfs 架构
  20. Codeforces Round #541 (Div. 2) D(并查集+拓扑排序) F (并查集)

热门文章

  1. C++ 自定义错误类
  2. 微信小程序路过——新手不要错过哦!
  3. css 基础-1
  4. Command(命令)
  5. Java 数组如何转成List集合
  6. axis2 webservice jar包使用情况(转)
  7. restframework api(基础3CBV)
  8. android中string.xml中%一$s、%1$d等的用法
  9. 【LeetCode 225_数据结构_栈_实现】Implement Stack using Queues
  10. WebLogic发布S2SH应用时提示ClassNotFoundException: org.hibernate.hql.ast.HqlToken异常