k-NN 没有显式的训练过程:给定训练集、标签和 k,计算待预测样本到训练集中所有样本的距离,选取距离最小的前 k 个训练样本,其中出现次数最多的标签即为预测标签。

约会类型分类、手写数字识别分类

  1. 计算输入数据到每一个训练数据的距离
  2. 选择前k个,判断其中类别最多的类作为预测类
import numpy as np
import operator
import matplotlib
import matplotlib.pyplot as plt # inX: test data, N features (1xN)
# dataSet: M samples, N features (MxN)
# label: for M samples (1xM)
# k: k-Nearest Neighbor
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Args:
        inX: Feature vector to classify (N features).
        dataSet: Training samples as an array with M rows and N columns.
        labels: Sequence of M labels, one per row of ``dataSet``.
        k: Number of nearest neighbours that vote. Values larger than M are
            clamped to M so every sample votes once.

    Returns:
        The label with the most votes; on a tie, the tied label whose first
        vote came from the closest neighbour wins.

    Raises:
        IndexError: If no neighbour votes (``k < 1`` or an empty data set).
    """
    dataSetSize = dataSet.shape[0]

    # Euclidean distance from inX to every training row (broadcast over rows).
    diffMat = np.asarray(inX) - dataSet
    distances = np.sqrt(np.sum(diffMat ** 2, axis=1))
    sortDistances = distances.argsort()

    # Count the labels of the k closest samples; dict insertion order follows
    # increasing distance, which is what makes the tie-break deterministic.
    classCount = {}
    for i in range(min(k, dataSetSize)):
        voteLabel = labels[sortDistances[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # Pick the most-voted class (sorted() is stable, so ties keep vote order).
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# Load a file into a matrix; the file has 4 tab-separated columns and the last column holds the labels
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Each non-blank line is stripped and split on tabs; the first three fields
    become one row of the feature matrix and the last field is parsed as an
    integer label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        float array with one row per data line and 3 columns, and
        ``classLabelVector`` is the list of integer labels in file order.

    Raises:
        ValueError: If a feature field is not a number or the last field is
            not an integer.
    """
    with open(filename) as f:
        # Blank lines (e.g. a trailing empty line) carry no sample; skip them.
        rows = [line.strip().split('\t') for line in f if line.strip()]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


# Draw some plots
def ex3():
    """Show two scatter plots of the dating data, styled by class label.

    Loads ``datingTestSet2.txt`` with ``file2matrix`` and plots column 1
    against column 2 (left) and column 0 against column 1 (right). Marker
    size and colour are both ``15.0 * label``.
    """
    datingDateMat, datingLables = file2matrix("datingTestSet2.txt")

    # Same array drives size and colour in both panels; build it once.
    markerScale = 15.0 * np.array(datingLables)

    fig = plt.figure()
    ax = fig.add_subplot(1, 2, 1)
    ax.scatter(datingDateMat[:, 1], datingDateMat[:, 2], s=markerScale, c=markerScale)
    ax2 = fig.add_subplot(1, 2, 2)
    ax2.scatter(datingDateMat[:, 0], datingDateMat[:, 1], s=markerScale, c=markerScale)
    plt.show()


# Normalize the data set to [0, 1]: (value - min) / (max - min)
def autoNorm(dataSet):
    """Min-max normalise every column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - min) / (max - min)`` using the minimum
    and maximum of its own column.

    Args:
        dataSet: 2-D array with one sample per row and one feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array, the
        per-column ``max - min`` and the per-column minimum. ``ranges`` and
        ``minVals`` let callers apply the same scaling to new samples.
    """
    minVals = dataSet.min(axis=0)
    maxVals = dataSet.max(axis=0)
    ranges = maxVals - minVals

    # A constant column has range 0; dividing by 1 instead maps it to 0.0
    # rather than producing NaN from 0/0. The returned ranges stay untouched.
    safeRanges = np.where(ranges == 0, 1, ranges)

    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


# Classifier test: takes the data set, the normalization parameters and the labels; 70% is used for training, 30% for testing
def datingClassTest(normDataSet, ranges, minVals, labels):
    """Evaluate k-NN on a random 70/30 split, then classify one user-entered sample.

    The rows of ``normDataSet`` are shuffled; the first 70% become the
    training set and the rest the test set. ``k`` is read from standard
    input, accuracy and error on the test set are printed, and finally three
    feature values are read from standard input, normalised with ``minVals``
    and ``ranges``, classified and printed.

    Args:
        normDataSet: Normalised feature array, one sample per row.
        ranges: Per-column ``max - min`` used to normalise ``normDataSet``.
        minVals: Per-column minimum used to normalise ``normDataSet``.
        labels: Integer labels, one per row of ``normDataSet``.
    """
    m = normDataSet.shape[0]
    numOfTrain = int(m * 0.7)

    # Random permutation of row indices: first part trains, remainder tests.
    shuffledIndex = np.arange(m)
    np.random.shuffle(shuffledIndex)
    trainIndex = shuffledIndex[:numOfTrain]
    testIndex = shuffledIndex[numOfTrain:]

    dataSet = normDataSet[trainIndex, :]
    testSet = normDataSet[testIndex, :]
    labels = np.array(labels)
    dataSetLabels = labels[trainIndex]
    testSetLabels = labels[testIndex]

    k = int(input("Input k: "))
    results = np.array([classify0(inX, dataSet, dataSetLabels, k) for inX in testSet])

    # Element-wise comparison of two arrays; count the matching predictions.
    acc = np.count_nonzero(results == testSetLabels) / len(testSetLabels)
    print("Accuracy: {:.2f}".format(acc))
    print("Error: {:.2f}".format(1 - acc))

    classList = ['not at all', 'in small doses', 'in large doses']
    # NOTE(review): the order of these three values must match the column
    # order of the data file - verify against datingTestSet2.txt.
    inX1 = float(input("1: percentage of time spent playing video games? "))
    inX2 = float(input("2: frequent flier miles earned per year? "))
    inX3 = float(input("3: liters of ice cream consumed per year? "))
    inXUser = (np.array([inX1, inX2, inX3]) - minVals) / ranges
    result = classify0(inXUser, dataSet, dataSetLabels, k)
    # Labels are assumed to be 1-based (1..3), as their use as marker sizes
    # in ex3 suggests, so shift to a 0-based index - TODO confirm.
    print("Predict: ", classList[result - 1])


if __name__ == '__main__':
    # -- ex1 --
    # inX = [1, 1]
    # dataSet = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    # labels = ['A', 'A', 'B', 'B']
    # k = 3
    # classify0(inX, dataSet, labels, k)

    # -- ex2 --
    datingDateMat, datingLables = file2matrix("datingTestSet2.txt")

    # -- ex3 --
    # ex3()

    # -- ex4 --
    # normDataSet, ranges, minVals = autoNorm(datingDateMat)

    # -- ex5 --
    # datingClassTest(normDataSet, ranges, minVals, datingLables)
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
import operator


def img2vector(filename):
    """Flatten a text file of digit characters into a list of integers.

    Every line is stripped of surrounding whitespace and each remaining
    character is converted with ``int``; the values of all lines are
    concatenated in file order.

    Args:
        filename: Path of the text file to read.

    Returns:
        A flat list of ints, one per character of the file's stripped lines.

    Raises:
        ValueError: If a character cannot be parsed as an integer.
    """
    with open(filename) as f:
        return [int(ch) for line in f for ch in line.strip()]


# inX: test data, N features (1xN)
# dataSet: M samples, N features (MxN)
# label: for M samples (1xM)
# k: k-Nearest Neighbor
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Args:
        inX: Feature vector to classify (N features).
        dataSet: Training samples as an array with M rows and N columns.
        labels: Sequence of M labels, one per row of ``dataSet``.
        k: Number of nearest neighbours that vote. Values larger than M are
            clamped to M so every sample votes once.

    Returns:
        The label with the most votes; on a tie, the tied label whose first
        vote came from the closest neighbour wins.

    Raises:
        IndexError: If no neighbour votes (``k < 1`` or an empty data set).
    """
    dataSetSize = dataSet.shape[0]

    # Euclidean distance from inX to every training row (broadcast over rows).
    diffMat = np.asarray(inX) - dataSet
    distances = np.sqrt(np.sum(diffMat ** 2, axis=1))
    sortDistances = distances.argsort()

    # Count the labels of the k closest samples; dict insertion order follows
    # increasing distance, which is what makes the tie-break deterministic.
    classCount = {}
    for i in range(min(k, dataSetSize)):
        voteLabel = labels[sortDistances[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # Pick the most-voted class (sorted() is stable, so ties keep vote order).
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def handwriting_class_test(data_set, training_labels, test_set, test_labels, k):
    """Classify every test sample with k-NN and print accuracy and error rate.

    Args:
        data_set: Training samples, one per row.
        training_labels: Labels for the rows of ``data_set``.
        test_set: Samples to classify, one per row.
        test_labels: Expected labels for the rows of ``test_set``.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ZeroDivisionError: If ``test_labels`` is empty.
    """
    results = [classify0(sample, data_set, training_labels, k) for sample in test_set]

    # Convert both sides to arrays so the comparison is element-wise even
    # when the caller passes plain lists.
    matches = np.asarray(results) == np.asarray(test_labels)
    acc = np.count_nonzero(matches) / len(test_labels)
    print("Accuracy: {:.5f}".format(acc))
    print("Error: {:.5f}".format(1 - acc))


if __name__ == '__main__':
    dir_path = r'H:\ML\MachineLearninginAction\02kNN\digits'
    training_path = os.path.join(dir_path, r'trainingDigits')
    test_path = os.path.join(dir_path, r'testDigits')

    training_files_list = os.listdir(training_path)
    test_files_list = os.listdir(test_path)

    # Build the training matrix and labels. The label is the part of the
    # file name before the first underscore.
    m = len(training_files_list)
    # m = 5
    data_set = np.zeros((m, 1024))
    training_labels = np.zeros(m)
    for i in range(m):
        data_set[i] = img2vector(os.path.join(training_path, training_files_list[i]))
        training_labels[i] = float(training_files_list[i].split('_')[0])

    # Build the test matrix and labels the same way.
    mt = len(test_files_list)
    test_set = np.zeros((mt, 1024))
    test_labels = np.zeros(mt)
    for i in range(mt):
        test_set[i] = img2vector(os.path.join(test_path, test_files_list[i]))
        test_labels[i] = float(test_files_list[i].split('_')[0])

    k = 3
    handwriting_class_test(data_set, training_labels, test_set, test_labels, k)

最新文章

  1. Linux 虚拟机网络适配器从E1000改为VMXNET3
  2. 关于phpcms中mysql和mysqli的区别
  3. 内存中 OLTP - 常见的工作负荷模式和迁移注意事项(二)
  4. 【转】Unity3D研究院之通过C#使用Advanced CSharp Messenger(五十)
  5. java中的浅拷贝与深拷贝
  6. Chapter 4 持久存储数据对象
  7. 关于Apache+PHP+MySQL的安装
  8. keil MDK编译器(V4.01)与H-JTAG的问题
  9. myeclipse 2014 customize_Perspective 失效解决方法-有效
  10. [POI2005]DWU-Double-row
  11. js 刷新
  12. 2017.11.18 手把手教你学51单片机-点亮LED
  13. 【NOIP 2016】Day1 T2 天天爱跑步
  14. MySQL relay_log_purge=0 时的风险
  15. Java编程思想学习笔记——注解
  16. RadioButtonFor值为false.默认选中的问题
  17. 样式缩写——css技巧(一)
  18. ypbind启动失败
  19. 各种背包的dp刷题板
  20. Codeforces Round #409

热门文章

  1. IoC容器-Bean管理XML方式(注入空值和特殊符号)
  2. linux下查看 SELinux状态及关闭SELinux
  3. 人口信息普查系统-JavaWeb-四
  4. c#代码设计:子类和父类
  5. 使用estimatedRowHeight的优缺点
  6. JAVA多线程学习七-线程池
  7. Cause: com.mysql.jdbc.exceptions.jdbc4.CommunicationsException: Communications link failure 解决
  8. 什么是Segue
  9. TableView载入WebView的一些小技巧 By 徐
  10. 【CF712E】Memory and Casinos(数学 期望 DP)