Training corpus format

Six categories and their labels are defined — five business types plus an "other" class: 0 shipping fees (运费), 1 parcel sending (寄件), 2 human agent (人工), 3 order modification (改单), 4 delivery reminders (催单), 5 other business enquiries (其他业务类).
A portion of the original data is selected as the training corpus and the test corpus; the expected line format is illustrated below.
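Each line of train.txt and test.txt is expected to hold the numeric label and the question text separated by a tab, matching the split("\t") call in the code below, with the text presumably already segmented into space-separated words (e.g. with ansj, as in the prediction code further down), since the whitespace-based Tokenizer does not segment Chinese by itself. A purely hypothetical illustration of the format (these example lines are made up, not taken from the real corpus):

0	运费 怎么 算
1	我 想 寄 一个 快递
2	请 转 人工 客服
3	帮 我 修改 收件 地址
4	麻烦 催 一下 我 的 快递
5	可以 开 发票 吗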

Building, testing, and saving the model

import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, IDF, LabeledPoint, Tokenizer}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.Row
import org.apache.spark.{SparkConf, SparkContext}

object shunfeng {

  case class RawDataRecord(label: String, text: String)

  def main(args: Array[String]): Unit = {
    val config = new SparkConf().setAppName("createModel").setMaster("local[4]")
    val sc = new SparkContext(config)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    // Enable implicit conversions so RDDs can be turned into DataFrames with .toDF
    import sqlContext.implicits._

    // Each input line is "label<TAB>text"
    val TrainDf = sc.textFile("E:\\train.txt").map { x =>
      val data = x.split("\t")
      RawDataRecord(data(0), data(1))
    }.toDF()
    val TestDf = sc.textFile("E:\\test.txt").map { x =>
      val data = x.split("\t")
      RawDataRecord(data(0), data(1))
    }.toDF()

    // Tokenizer: split each sentence into words on whitespace
    val TrainTokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val TrainWords = TrainTokenizer.transform(TrainDf)
    val TestTokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val TestWords = TestTokenizer.transform(TestDf)

    // Feature extraction with TF-IDF
    val TrainHashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(5000)
    val TrainData = TrainHashingTF.transform(TrainWords)
    val TestHashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(5000)
    val TestData = TestHashingTF.transform(TestWords)

    val TrainIdf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val TrainIdfmodel = TrainIdf.fit(TrainData)
    val TrainForm = TrainIdfmodel.transform(TrainData)
    // Note: a second IDF model is fitted on the test set itself here
    val TestIdf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val TestIdfModel = TestIdf.fit(TestData)
    val TestForm = TestIdfModel.transform(TestData)

    // Convert the data into the (label, features) form expected by Naive Bayes
    val TrainDF = TrainForm.select($"label", $"features").map {
      case Row(label: String, features: Vector) =>
        LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
    }
    val TestDF = TestForm.select($"label", $"features").map {
      case Row(label: String, features: Vector) =>
        LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
    }

    // Build the model
    val model = new NaiveBayes().fit(TrainDF)
    val predictions = model.transform(TestDF)
    predictions.show()

    // Evaluate the model
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(predictions)
    println("accuracy: " + accuracy)

    // Save the model
    model.write.overwrite().save("model")
  }
}
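The program above fits a separate IDF model on the test set; a common variation is to reuse the IDF model fitted on the training data so that both splits are weighted consistently. A minimal sketch, assuming the TrainIdfmodel, TestData, model and evaluator values defined in the program above:

// Sketch: transform the test term frequencies with the training-fit IDF model
val TestFormShared = TrainIdfmodel.transform(TestData)
val TestDFShared = TestFormShared.select($"label", $"features").map {
  case Row(label: String, features: Vector) =>
    LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
}
val predictionsShared = model.transform(TestDFShared)
println("accuracy with shared IDF: " + evaluator.evaluate(predictionsShared))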

Model evaluation:
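Besides accuracy, MulticlassClassificationEvaluator also supports the f1, weightedPrecision and weightedRecall metrics. A minimal sketch that reuses the predictions DataFrame produced in the training program above:

// Report additional multiclass metrics on the same predictions
for (metric <- Seq("f1", "weightedPrecision", "weightedRecall")) {
  val value = new MulticlassClassificationEvaluator()
    .setLabelCol("label")
    .setPredictionCol("prediction")
    .setMetricName(metric)
    .evaluate(predictions)
  println(s"$metric: $value")
}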

Using the model for prediction

import org.ansj.recognition.impl.StopRecognition
import org.ansj.splitWord.analysis.DicAnalysis
import org.apache.spark.ml.classification.NaiveBayesModel
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

object stest {

  // The single column is named "label"; it holds the segmented question text
  case class RawDataRecord(label: String)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[4]").setAppName("shunfeng")
    val sc = new SparkContext(conf)
    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    val frdd = sc.textFile("C:\\Users\\Administrator\\Desktop\\01\\*")
    val filter = new StopRecognition()
    filter.insertStopNatures("w") // filter out punctuation

    val rdd = frdd.filter(_.contains("含中文"))          // keep only lines containing the marker string
      .filter(!_.contains("▃▂▁机器人丰小满使用指引▁▂▃")) // drop the chatbot usage-guide banner
      .map(_.split("含中文")(0))                          // keep the part before the marker
      .map(_.split("\\|")(3))                             // take the fourth pipe-delimited field
      .filter(_.length > 1)
      .map { x =>
        // Segment the sentence with ansj and drop the stop natures defined above
        RawDataRecord(DicAnalysis.parse(x).recognition(filter).toStringWithOutNature(" "))
      }.toDF()

    val tokenizer = new Tokenizer().setInputCol("label").setOutputCol("words")
    val wordsData = tokenizer.transform(rdd)
    // A larger setNumFeatures gives higher precision at a higher cost;
    // it must match the value used during training (5000)
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(5000)
    val PreData = hashingTF.transform(wordsData)
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(PreData)
    val PreModel = idfModel.transform(PreData)

    // Load the saved model and predict
    val model = NaiveBayesModel.load("model")
    model.transform(PreModel).select("words", "prediction").show()
  }
}
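The prediction column only contains the numeric labels (0.0 to 5.0). A hypothetical helper to map them back to the category names defined at the top of this post; it assumes the model and PreModel values from the program above and would sit at the end of main:

import org.apache.spark.sql.functions.{col, udf}

// Hypothetical mapping from numeric prediction back to category name
val labelNames = Map(0.0 -> "运费", 1.0 -> "寄件", 2.0 -> "人工",
                     3.0 -> "改单", 4.0 -> "催单", 5.0 -> "其他业务类")
val toName = udf((p: Double) => labelNames.getOrElse(p, "unknown"))
model.transform(PreModel)
  .withColumn("category", toName(col("prediction")))
  .select("words", "prediction", "category")
  .show()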

Results:
