package Spark_MLlib

import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.{Vector, Vectors} /**
* K-means clustering (K均值) example built on Spark ML.
*/
// Row type of the DataFrame handed to KMeans: one `features` column that
// holds a single Spark ML vector per input record.
case class features_schema(
  features: Vector
)
/**
 * Runnable K-means example: reads comma-separated numeric rows from a local
 * text file, fits a Spark ML `KMeans` model on them and prints the cluster
 * assignments, the cluster centers and the within-set sum of squared errors.
 */
object 聚类__KMeans {
  // Local session with two worker threads. `spark.implicits._` provides the
  // encoders that `toDF()` relies on inside `main`.
  val spark = SparkSession.builder().master("local[2]").getOrCreate()
  import spark.implicits._

  /**
   * Entry point: loads the data, trains the model and prints the results.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    try {
      // The first four comma-separated fields of every line are parsed as
      // doubles and packed into one dense feature vector.
      val data = spark.sparkContext
        .textFile("file:///home/soyo/桌面/spark编程测试数据/soyo2.txt")
        .map(_.split(","))
        .map { x =>
          features_schema(
            Vectors.dense(x(0).toDouble, x(1).toDouble, x(2).toDouble, x(3).toDouble)
          )
        }
        .toDF()
      data.show()

      // Fit K-means with three clusters, reading the "features" column and
      // writing the assigned cluster index to "prediction".
      val kMeansModel = new KMeans()
        .setK(3)
        .setFeaturesCol("features")
        .setPredictionCol("prediction")
        .fit(data)

      val results = kMeansModel.transform(data)
      results.show()

      // Final cluster centers produced by the model: one vector per cluster,
      // so K of them in total.
      kMeansModel.clusterCenters.foreach(println)

      // Within-set sum of squared errors. It can serve as a guide when
      // choosing K, weighed against the needs of the use case.
      // NOTE(review): `computeCost` is deprecated since Spark 2.4 and no longer
      // available in Spark 3.x; confirm the target Spark version before upgrading
      // (alternatives there: `ClusteringEvaluator` or `summary.trainingCost`).
      val cost = kMeansModel.computeCost(data)
      println(cost)
    } finally {
      // Release the local Spark context even when loading or training fails.
      spark.stop()
    }
  }
}

结果:

+-----------------+
|         features|
+-----------------+
|[5.1,3.5,1.4,0.2]|
|[4.9,3.0,1.4,0.2]|
|[4.7,3.2,1.3,0.2]|
|[4.6,3.1,1.5,0.2]|
|[5.0,3.6,1.4,0.2]|
|[5.4,3.9,1.7,0.4]|
|[4.6,3.4,1.4,0.3]|
|[5.0,3.4,1.5,0.2]|
|[4.4,2.9,1.4,0.2]|
|[4.9,3.1,1.5,0.1]|
|[5.4,3.7,1.5,0.2]|
|[4.8,3.4,1.6,0.2]|
|[4.8,3.0,1.4,0.1]|
|[4.3,3.0,1.1,0.1]|
|[5.8,4.0,1.2,0.2]|
|[5.7,4.4,1.5,0.4]|
|[5.4,3.9,1.3,0.4]|
|[5.1,3.5,1.4,0.3]|
|[5.7,3.8,1.7,0.3]|
|[5.1,3.8,1.5,0.3]|
+-----------------+
only showing top 20 rows

+-----------------+----------+
|         features|prediction|
+-----------------+----------+
|[5.1,3.5,1.4,0.2]|         0|
|[4.9,3.0,1.4,0.2]|         0|
|[4.7,3.2,1.3,0.2]|         0|
|[4.6,3.1,1.5,0.2]|         0|
|[5.0,3.6,1.4,0.2]|         0|
|[5.4,3.9,1.7,0.4]|         0|
|[4.6,3.4,1.4,0.3]|         0|
|[5.0,3.4,1.5,0.2]|         0|
|[4.4,2.9,1.4,0.2]|         0|
|[4.9,3.1,1.5,0.1]|         0|
|[5.4,3.7,1.5,0.2]|         0|
|[4.8,3.4,1.6,0.2]|         0|
|[4.8,3.0,1.4,0.1]|         0|
|[4.3,3.0,1.1,0.1]|         0|
|[5.8,4.0,1.2,0.2]|         0|
|[5.7,4.4,1.5,0.4]|         0|
|[5.4,3.9,1.3,0.4]|         0|
|[5.1,3.5,1.4,0.3]|         0|
|[5.7,3.8,1.7,0.3]|         0|
|[5.1,3.8,1.5,0.3]|         0|
+-----------------+----------+
only showing top 20 rows

[5.005999999999999,3.4180000000000006,1.4640000000000002,0.2439999999999999]
[6.8538461538461535,3.076923076923076,5.715384615384614,2.0538461538461537]
[5.883606557377049,2.740983606557377,4.388524590163936,1.4344262295081966]
78.94506582597859

最新文章

  1. Linux搭建nfs服务器
  2. 谈谈软件项目的dependency
  3. TP框架整合Swagger UI接口文档
  4. java基础之——类的初始化顺序
  5. [转]SQLServer 2008数据库查看死锁、堵塞的SQL语句
  6. ArcGIS Engine断开其他ArcSDE用户连接的解决方案
  7. [转]DataTable用中使用Compute 实现简单的DataTable数据的统计
  8. Java学习步骤
  9. tail -f
  10. PrefixSpan算法原理总结
  11. 最常用的UML工具介绍
  12. 冒泡算法给0~9随机n位数字排序
  13. vue 学习链接地址
  14. Appium+Java(三)搭建环境之踩过的坑
  15. repo常用命令及常见问题汇总
  16. 三、fgetc与fputc
  17. ES6 Generator async
  18. java-信息安全(一)-BASE64,MD5,SHA,HMAC,RIPEMD算法
  19. MVC ---- DBHelper.ttinclude
  20. linux-网络使用

热门文章

  1. 集训第四周(高效算法设计)I题 (贪心)
  2. ROS 笔记 程序包/节点/topic
  3. Leetcode 149.直线上最多的点数
  4. 微信开放平台PC端扫码登录功能个人总结
  5. Jackson 字符串转List<Map>
  6. Codeforces 777E(离散化+dp+树状数组或线段树维护最大值)
  7. Ubuntu 16.04下操作iptables的技巧(解决Failed to start iptables.service: Unit iptables.service not found.或者/etc/init.d/iptables: 没有那个文件或目录)
  8. select语句中会影响查询效率的因素
  9. ovs ml2
  10. paramiko错误信息:Paramiko error: size mismatch in put