from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import os
from os import listdir
# Font used for the Chinese titles and axis labels of the plots below.
myfont = matplotlib.font_manager.FontProperties(
    fname='/aibi/myjupyter/02_Quant/images/simsun.ttf',
    size=12,
)
# The name `np` is never bound in this file (NumPy is only star-imported
# above), so call the star-imported function directly to avoid a NameError.
set_printoptions(suppress=True, precision=2, threshold=10, linewidth=20)


def createDataSet():
    """Return the toy training set used by the kNN examples.

    Returns:
        A tuple ``(group, labels)``: ``group`` is a 4 x 2 array holding one
        sample per row and one feature per column, and ``labels`` is the list
        of class names matching the rows of ``group``.
    """
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    classes = list('AABB')
    return array(samples), classes


group, labels= createDataSet()


# 相当于每行是一个实例，每列是一个特征
group

array([[1. , 1.1],
       [1. , 1. ],
       [0. , 0. ],
       [0. , 0.1]])


group.shape

(4, 2)


# 相当于是一列分类（目标变量）
labels

['A', 'A', 'B', 'B']


# Visualise the two classes: the first two rows of `group` are class A and
# the remaining rows are class B.
x1, y1 = group[:2, 0], group[:2, 1]
print('x1 is: ', x1, ' and ', 'y1 is: ', y1)
x2, y2 = group[2:, 0], group[2:, 1]
print('x2 is: ', x2, ' and ', 'y2 is: ', y2)
for xs, ys, colour, tag in ((x1, y1, 'red', 'A'), (x2, y2, 'blue', 'B')):
    plt.scatter(xs, ys, marker='o', color=colour, s=40, label=tag)
plt.legend(loc='best')
plt.show()

x1 is:  [1. 1.]  and  y1 is:  [1.1 1. ]
x2 is:  [0. 0.]  and  y2 is:  [0.  0.1]


def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean and are measured between ``inX`` and every row
    of ``dataSet``.

    Args:
        inX: feature vector to classify; a sequence of d values or an array
            that broadcasts against the rows of ``dataSet`` (for example
            shape ``(d,)`` or ``(1, d)``).
        dataSet: training samples, one row per sample and one column per
            feature.
        labels: sequence giving the class of each row of ``dataSet``; it is
            indexed by row number.
        k: number of nearest neighbours that vote; must satisfy
            ``1 <= k <= number of rows in dataSet``.

    Returns:
        The label that received the most votes. When several labels tie,
        the one whose first vote came from the nearer neighbour wins.

    Raises:
        ValueError: if ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            'k must be between 1 and the number of training samples '
            '({}), got {}'.format(dataSetSize, k)
        )

    # Broadcasting subtracts inX from every training row at once, so the
    # query vector does not need to be replicated first.
    diffMat = asarray(inX) - dataSet
    # Euclidean distance: square the differences, sum per row, take the root.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    # Row indices ordered from the nearest sample to the farthest one.
    sortedDistIndicies = distances.argsort()

    # Tally the classes of the k nearest samples.
    classCount = {}
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Order the (label, votes) pairs by votes, highest first. The sort is
    # stable, so tied labels keep the order in which they were first seen.
    sortedClassCount = sorted(
        classCount.items(),
        key=operator.itemgetter(1),
        reverse=True,
    )
    return sortedClassCount[0][0]


# 输入向量 inX = [0.7,0.5]，就是一条新的实例，返回结果就是预测的类别
classify0([0.7,0.5], group, labels, 3)

'A'


classify0([0.5,0.4], group, labels, 3)

'B'


classify0([0.4,0.3], group, labels, 3)

'B'


# Query points to classify: x3 holds their x coordinates and y3 their y
# coordinates, i.e. the points (0.5, 0.4) and (0.4, 0.3).
x3 = [0.5, 0.4]
y3 = [0.4, 0.3]
# Classify every query point on its own so that each one is drawn with the
# colour and legend label of its own prediction, not of the first point's.
predictions = [classify0([px, py], group, labels, 3) for px, py in zip(x3, y3)]

# Visualise the training samples: the first two rows of `group` are class A
# and the remaining rows are class B.
x1 = group[:2, 0]
y1 = group[:2, 1]
print('x1 is: ', x1, ' and ', 'y1 is: ', y1)
x2 = group[2:, 0]
y2 = group[2:, 1]
print('x2 is: ', x2, ' and ', 'y2 is: ', y2)
plt.scatter(x1, y1, marker='o', color='red', s=40, label='A')
plt.scatter(x2, y2, marker='o', color='blue', s=40, label='B')
# Draw the query points grouped by predicted class (one scatter call per
# class, in order of first appearance).
for l in dict.fromkeys(predictions):
    c = 'red' if l == 'A' else 'blue'
    classX = [px for px, predicted in zip(x3, predictions) if predicted == l]
    classY = [py for py, predicted in zip(y3, predictions) if predicted == l]
    plt.scatter(classX, classY, marker='o', color=c, s=40, label=l)
plt.legend(loc='best')
plt.show()

x1 is:  [1. 1.]  and  y1 is:  [1.1 1. ]
x2 is:  [0. 0.]  and  y2 is:  [0.  0.1]


def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    Every non-blank line supplies one sample: its first three fields are the
    numeric features and its last field is the integer class label.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is an
        ``n x 3`` float array with one row per sample, and
        ``classLabelVector`` is the list of the ``n`` integer labels.

    Raises:
        ValueError: if a feature or label field cannot be parsed as a number.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Strip surrounding whitespace and skip blank lines, so a trailing
        # empty line does not abort the load.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        # The first three fields are the features of this sample.
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        # The last field is the class label.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


datingDataMat, datingLabels = file2matrix('rawdata/datingTestSet2.txt')


datingDataMat

array([[40920.  ,
            8.33,
            0.95],
       [14488.  ,
            7.15,
            1.67],
       [26052.  ,
            1.44,
            0.81],
       ...,
       [26575.  ,
           10.65,
            0.87],
       [48111.  ,
            9.13,
            0.73],
       [43757.  ,
            7.88,
            1.33]])


datingLabels[:20]

[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]


import numpy

# Demonstrate numpy.argwhere: it returns the (row, column) index of every
# element that satisfies the condition.
a = numpy.array([
    [3, 2, 1],
    [2, 5, 7],
    [4, 7, 8],
])
print('a is: \n', a, '\n')
matchesSeven = (a == 7)
itemindex = numpy.argwhere(matchesSeven)
print('itemindex is: \n', itemindex)

a is: 
 [[3 2 1]
 [2 5 7]
 [4 7 8]] 

itemindex is: 
 [[1 2]
 [2 1]]


# Column 1 of datingDataMat: percentage of time spent playing video games.
# Column 2 of datingDataMat: litres of ice cream consumed per week.
gameShare = datingDataMat[:, 1]
iceCream = datingDataMat[:, 2]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(gameShare, iceCream)
ax.set_title('数据观察', fontproperties=myfont, fontsize=18)
ax.set_xlabel('玩视频游戏所耗时间百分比', fontproperties=myfont)
ax.set_ylabel('每周所消费的冰激凌公升数', fontproperties=myfont)
plt.show()


"""
matplotlib.pyplot.scatter(
      x, y
    , s=None
    , c=None
    , marker=None
    , cmap=None
    , norm=None
    , vmin=None
    , vmax=None
    , alpha=None
    , linewidths=None
    , verts=None
    , edgecolors=None
    , hold=None
    , data=None
    , **kwargs
)
x, y：对应了平面点的位置，
s：控制点大小，
c：对应颜色指示值，也就是如果采用了渐变色的话，我们设置c=x就能使得点的颜色根据点的x值变化，
cmap：调整渐变色或者颜色列表的种类
marker：控制点的形状
alpha：控制点的透明度
"""
# Same scatter plot as before, but the class label of every sample now
# drives both the marker size and the marker colour.
labelScale = 15.0 * array(datingLabels)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2], s=labelScale, c=labelScale)
ax.set_title('数据观察', fontproperties=myfont, fontsize=18)
ax.set_xlabel('玩视频游戏所耗时间百分比', fontproperties=myfont)
ax.set_ylabel('每周所消费的冰激凌公升数', fontproperties=myfont)
plt.show()


def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    Every non-blank line supplies one sample: its first three fields are the
    numeric features and its last field is the class, written either as a
    non-negative integer or as one of the names ``didntLike`` (1),
    ``smallDoses`` (2) or ``largeDoses`` (3).

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is an
        ``n x 3`` float array with one row per sample, and
        ``classLabelVector`` is the list of the ``n`` integer labels.

    Raises:
        ValueError: if a feature field is not numeric, or if a label is
            neither a digit string nor one of the known class names.
    """
    love_dictionary = {'largeDoses': 3, 'smallDoses': 2, 'didntLike': 1}

    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Strip surrounding whitespace and skip blank lines, so a trailing
        # empty line does not abort the load.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))       # one row per sample
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        rawLabel = listFromLine[-1]
        if rawLabel.isdigit():
            classLabelVector.append(int(rawLabel))
        elif rawLabel in love_dictionary:
            classLabelVector.append(love_dictionary[rawLabel])
        else:
            # Fail here instead of storing None, which would only surface
            # later as a confusing error in plotting or classification.
            raise ValueError(
                'unknown class label {!r} in sample {} of {}'.format(
                    rawLabel, index + 1, filename
                )
            )
    return returnMat, classLabelVector

datingDataMat, datingLabels = file2matrix('rawdata/datingTestSet.txt')


# Scatter of column 0 against column 1 of datingDataMat; the class label of
# every sample drives both the marker size and the marker colour.
classScale = 15.0 * array(datingLabels)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1], s=classScale, c=classScale)
ax.set_title('数据观察', fontproperties=myfont, fontsize=18)
ax.set_xlabel('每年获取的飞行常客里程数', fontproperties=myfont)
ax.set_ylabel('玩视频游戏所耗时间百分比', fontproperties=myfont)
plt.show()


# 0 飞行里程
# 1 游戏时间
# 2 冰淇淋数
pd.DataFrame(datingDataMat)


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly to the range [0, 1].

    Each value becomes ``(value - column minimum) / column range``.

    Args:
        dataSet: 2-D NumPy array with one row per sample and one column per
            feature.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: ``normDataSet`` is the
        rescaled array, ``ranges`` holds the divisor used for each column
        (``max - min``, or 1 for a column whose values are all equal), and
        ``minVals`` holds the minimum of each column. A new sample can be
        rescaled the same way with ``(sample - minVals) / ranges``.
    """
    # Column-wise extremes (axis 0 runs over the samples).
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; dividing by it would yield NaN. Use 1
    # as the divisor so such a column is rescaled to all zeros instead.
    ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column offset and divisor to every row.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


normDataSet, ranges, minVals = autoNorm(datingDataMat)


pd.DataFrame(normDataSet)


pd.DataFrame(ranges).T


pd.DataFrame(minVals).T


def datingClassTest(h, k, filename='rawdata/datingTestSet.txt'):
    """Measure the hold-out error rate of the kNN classifier on the dating data.

    The first ``h`` fraction of the samples is used as the test set and the
    remaining samples form the training set. The error rate and the number
    of misclassified test samples are printed.

    Args:
        h: hold-out ratio, a number or numeric string (for example ``0.08``
            or ``'0.08'``).
        k: number of neighbours that vote, an integer or integer string.
        filename: path of the data file to load.

    Raises:
        ValueError: if the ratio leaves no test samples or no training
            samples.
    """
    hoRatio = float(h)
    neighbours = int(k)

    # Load all samples and rescale every feature to [0, 1].
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)

    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if not 0 < numTestVecs < m:
        raise ValueError(
            'hold-out ratio {} gives {} test samples out of {}; both the '
            'test set and the training set must be non-empty'.format(
                hoRatio, numTestVecs, m
            )
        )

    # Rows [0, numTestVecs) are the test set; the rest is the training set.
    # Slice the training data once instead of on every iteration.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat, trainingLabels, neighbours)
        # Count the sample as an error when the prediction differs from its
        # recorded label.
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: {}".format(errorCount/float(numTestVecs)))
    print(errorCount)


datingClassTest('0.08', '5')

the total error rate is: 0.025
2.0


def classifyPerson():
    """Interactively classify one person with the dating data set.

    Prompts for three feature values, rescales them with the minimum and
    range of the training data, classifies the result with the 3 nearest
    neighbours and prints the verdict.
    """
    verdicts = ['not at all', 'in small doses', 'in large doses']

    # Read the three features from the user.
    gameShare = float(input("percentage of time spent playing video games?    A: "))
    flierMiles = float(input("frequent flier miles earned per year?    A: "))
    iceCreamLitres = float(input("liters of ice cream consumed per year?    A: "))

    # Load the training data and rescale it.
    samples, sampleLabels = file2matrix('rawdata/datingTestSet.txt')
    scaledSamples, spans, lows = autoNorm(samples)

    # Assemble the query in the column order of the data file.
    inArr = array([flierMiles, gameShare, iceCreamLitres])
    print('inArr is: ', inArr)

    # Rescale the query exactly like the training data before classifying.
    scaledQuery = (inArr - lows) / spans
    predicted = classify0(scaledQuery, scaledSamples, sampleLabels, 3)

    # Class labels start at 1 while the verdict list is indexed from 0.
    verdict = verdicts[predicted - 1]
    print("You will probably like this person: {}".format(verdict))


classifyPerson()

percentage of time spent playing video games?    A: 10
frequent flier miles earned per year?    A: 10000
liters of ice cream consumed per year?    A: 0.5
inArr is:  [10000.     10.
     0.5]
You will probably like this person: in small doses


def img2vector(filename):
    """Flatten a 32 x 32 text image into a 1 x 1024 row vector.

    The first 32 characters of each of the first 32 lines of the file are
    converted to integers and stored row after row, so that the result can
    be passed to ``classify0``.

    Args:
        filename: path of the text image to read.

    Returns:
        A float array of shape ``(1, 1024)``.

    Raises:
        IndexError: if one of the first 32 lines has fewer than 32 characters.
        ValueError: if a character cannot be converted to an integer.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            # Image row i occupies positions 32*i .. 32*i + 31 of the vector.
            returnVect[0, 32 * i:32 * (i + 1)] = [int(lineStr[j]) for j in range(32)]
    return returnVect


testVector = img2vector('rawdata/ch02/testDigits/0_13.txt')
testVector[0,0:31]

array([0., 0., 0.,
       ..., 0., 0.,
       0.])


testVector[0,32:63]

array([0., 0., 0.,
       ..., 0., 0.,
       0.])


def _digitLabel(fileName):
    """Return the class encoded in a digit file name such as ``0_13.txt``."""
    # Drop the extension, then keep the part before the first underscore.
    baseName = fileName.split('.')[0]
    return int(baseName.split('_')[0])


def handwritingClassTest(trainingDir='rawdata/ch02/trainingDigits',
                         testDir='rawdata/ch02/testDigits',
                         k=3):
    """Measure the error rate of the kNN classifier on handwritten digits.

    Every ``.txt`` file in ``trainingDir`` becomes one training sample and
    every ``.txt`` file in ``testDir`` is classified against them. The true
    class of a file is the integer in front of the first underscore of its
    name. The number of test files, the number of errors and the error rate
    are printed.

    Args:
        trainingDir: directory holding the training images.
        testDir: directory holding the test images.
        k: number of neighbours that vote.

    Raises:
        ValueError: if ``testDir`` contains no ``.txt`` files.
    """
    # Ignore entries that are not text images (for example hidden files),
    # because their names cannot be parsed into a class.
    trainingFileList = [name for name in listdir(trainingDir) if name.endswith('.txt')]
    m = len(trainingFileList)

    # Build the training set: one label and one 1024-element row per file.
    hwLabels = [_digitLabel(name) for name in trainingFileList]
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        trainingMat[i, :] = img2vector('{}/{}'.format(trainingDir, fileNameStr))

    testFileList = [name for name in listdir(testDir) if name.endswith('.txt')]
    mTest = len(testFileList)
    print('mTest is: ', mTest)
    if mTest == 0:
        raise ValueError('no .txt test files found in {}'.format(testDir))

    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _digitLabel(fileNameStr)
        vectorUnderTest = img2vector('{}/{}'.format(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: {}".format(errorCount))
    # The `%` presentation type multiplies by 100, so the printed figure is
    # a real percentage (the old format printed the raw fraction with a
    # percent sign appended).
    print("\nthe total error rate is: {:.4%}".format(errorCount/float(mTest)))


handwritingClassTest()

mTest is:  946

the total number of errors is: 12.0

the total error rate is: 0.0127%

	0	1	2
0	40920.0	8.326976	0.953952
1	14488.0	7.153469	1.673904
2	26052.0	1.441871	0.805124
3	75136.0	13.147394	0.428964
4	38344.0	1.669788	0.134296
...	...	...	...
995	11145.0	3.410627	0.631838
996	68846.0	9.974715	0.669787
997	26575.0	10.650102	0.866627
998	48111.0	9.134528	0.728045
999	43757.0	7.882601	1.332446

	0	1	2
0	0.448325	0.398051	0.562334
1	0.158733	0.341955	0.987244
2	0.285429	0.068925	0.474496
3	0.823201	0.628480	0.252489
4	0.420102	0.079820	0.078578
...	...	...	...
995	0.122106	0.163037	0.372224
996	0.754287	0.476818	0.394621
997	0.291159	0.509103	0.510795
998	0.527111	0.436655	0.429005
999	0.479408	0.376809	0.785718

k-近邻算法实践

k-近邻算法（KNN）概述

准备：使用Python导入数据

从文本文件中解析数据

Tips：欧式距离

如何测试分类器

示例：使用 k-近邻算法改进约会网站的配对效果

准备数据：从文本文件中解析数据

Tips：ndarray索引

分析数据：使用Matplotlib创建散点图

准备数据：归一化数值

测试算法：作为完整程序验证分类器

使用算法：构建完整可用系统

示例：手写识别系统

准备数据：将图像转换为测试向量

测试算法：使用 k-近邻算法识别手写数字