功能:接收来自kafka的数据,数据是一篇文章,来判断文章的类型,把判断的结果一并保存到Hbase,并把文章建立索引(没有代码只有一个空壳,可以自己实现,以后有机会了可能会补上)

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.feature.{HashingTF, IDF, LabeledPoint, Tokenizer}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream
import org.apache.spark.SparkConf
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream.fromReceiverInputDStream
import org.apache.spark.rdd.RDD
import org.apache.spark.ml.classification.NaiveBayesModel
import org.omg.CORBA_2_3.portable.OutputStream
import java.io.FileOutputStream class UseModel1 { }
object UseModel1{
//流程代码
def main(args: Array[String]): Unit = {
val Array(zkQuorum, group, topics, numThreads) =Array("192.168.10.199:2181","order","order","");
val conf = new SparkConf().setAppName("useModel").setMaster("local[4]");
val ssc = getStreamingContext(conf, );
val dstreams = getKafkaDstream(ssc, topics, zkQuorum, group, numThreads);
val dstream = dstreams.inputDStream.map(_._2);
dstream.persist()
//测试
dstream.print()
//如果能判断不为空就更好了
dstream.foreachRDD(rdd =>everyRDD(rdd))
ssc.start()
ssc.awaitTermination()
} //得到StreamingContext
def getStreamingContext(conf:SparkConf,secend:Int):StreamingContext = {
return new StreamingContext(conf, Seconds(secend))
} //得到sparkSession
def getSparkSession(conf:SparkConf): SparkSession = {
val spark = SparkSession.builder()
.config(conf)
.config("spark.sql.warehouse.dir", "warehouse/dir")
.getOrCreate()
return spark;
} //得到kafkaDStream
def getKafkaDstream(ssc:StreamingContext,topics:String,zkQuorum:String,group:String,numThreads:String):JavaPairReceiverInputDStream[String,String] ={
ssc.checkpoint("directory")
val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap;
val stream = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)
return stream;
} //文件保存测试
def savaString(str:String):Unit={
val out = new FileOutputStream("D:\\decstop\\file.txt",true);
out.write(str.getBytes)
out.flush()
out.close()
} //每一个rdd做动作
def everyRDD(rdd:RDD[String]){
val sameModel = NaiveBayesModel.load("resoult") val spark = getSparkSession(rdd.context.getConf)
import spark.implicits._
val rddDF = rdd.map { line => (,line) }.toDF("label","text").persist()
//rddDF.show()
val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val tokenizerRDD = tokenizer.transform(rddDF)
//tokenizerRDD.show(false) val hashingTF =
new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures()
val hashingTFRDD = hashingTF.transform(tokenizerRDD) val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(hashingTFRDD)
val rescaledData = idfModel.transform(hashingTFRDD)
//rescaledData.show(false)
//转化为贝叶斯需要的格式
val useDataRdd = rescaledData.select($"label", $"features").map{
case Row(label:Int , features:Vector) =>
LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
}
val predictions = sameModel.transform(useDataRdd)
predictions.persist()
//predictions.show(false)
//参照下面可以实现各种的逻辑,可以把下面的保存,建索引都加上
predictions.select($"label",$"prediction").foreach { x => savaString((""+x.getAs("label")+" "+x.getAs("prediction")+"\n\r")) } //测试
predictions.createOrReplaceTempView("prediction")
rddDF.createOrReplaceTempView("atical") //spark.sql("select p.label,p.prediction,a.text from prediction p,atical a where p.label=a.label").select(col, cols) } //简历索引 主要的建立索引的有hbase_rowKay(time) aothor title article
def buiderIndex(){} //保存到hbase
def savaToHbase(){ } //发送到下一个kafka 发送的数据 time 正舆情数量 负面舆情数量 百分比 是否报警 def sendToKafka(){ }
}

代码实现:

最新文章

  1. [css]全屏背景图片设置,django加载图片路径
  2. [LeetCode] Intersection of Two Linked Lists 求两个链表的交点
  3. IP地址,子网掩码,默认网关,DNS服务器知识详解(转)
  4. hibernate 异常:Unexpected Exception caught setting
  5. 【读书笔记】iOS-使用Web Service-基于客户端服务器结构的网络通信(一)
  6. 关于HTML5在动画制作工具Animatron的一些问题
  7. QTreeWidget
  8. MyEclipse 开发 Web项目发布到 Tomcat 下的Root 目录
  9. HDU 3835 R(N)
  10. Class文件结构
  11. 基于Node的PetShop,RESTful API以及认证
  12. MVC Action,Service中筛选int 和list<int>
  13. ubuntu修改主机名和出现问题
  14. G方法的华丽升级
  15. 关于__irq 的使用
  16. 深入理解Azure自动扩展集VMSS(3)
  17. c语言libcurl 使用实例get/post方法+c语言字符串处理
  18. windows下取linux系统里面的文件
  19. 45、SQL逻辑查询语句执行顺序
  20. C# 类型基础(上)

热门文章

  1. jQuery弹出层效果
  2. dedecms的arclist循环中判断第一个li添加css,否则不加
  3. module、applet
  4. Java精选笔记_Java编程基础
  5. strip() 、lstrip() 、rstrip()
  6. Core Location和MapKit的一些简单使用
  7. 《C++ Primer Plus》15.4 RTTI 学习笔记
  8. Size Balanced Tree
  9. LeetCode——Best Time to Buy and Sell Stock II
  10. SWT/JFace开发遇到org.eclipse.core.runtime.IProgressMonitor问题的解决办法(转载)