from math import log
import operator
import matplotlib.pyplot as plt


# Shannon entropy of a data set's class labels.
def calcShannonEnt(dataSet):
    """Return the base-2 Shannon entropy of the class labels in *dataSet*.

    Each record's class label is its last element. The entropy is
    ``-sum(p * log2(p))`` over the observed labels, where ``p`` is the
    fraction of records carrying that label. An empty data set yields 0.0
    because there are no labels to sum over.
    """
    total = len(dataSet)

    # Tally how many records carry each class label.
    tally = {}
    for record in dataSet:
        label = record[-1]
        tally[label] = tally.get(label, 0) + 1

    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        # log(p, 2) is <= 0 for a probability, so subtracting accumulates
        # a non-negative entropy.
        entropy -= p * log(p, 2)
    return entropy


# A small "is it a fish?" sample data set.
def createDataSet():
    """Return the sample data set and the names of its feature columns.

    The data set is a list of records (lists). In every record the leading
    elements are feature values and the last element is the class label.
    A fresh list of fresh records is built on every call, so callers may
    mutate the result freely.
    """
    labels = ['no surfacing', 'flippers']
    rows = (
        (1, 1, 'yes'),
        (1, 1, 'yes'),
        (1, 0, 'no'),
        (0, 1, 'no'),
        (0, 1, 'no'),
    )
    dataSet = [list(row) for row in rows]
    return dataSet, labels


# Build the sample data set and its feature names.
myDat, labels = createDataSet()


# Echo the data set; the bare literal after each echoed expression in this
# section appears to be the recorded notebook output for that cell.
myDat

[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]


labels

['no surfacing', 'flippers']


# Entropy of the sample data with its two class labels.
calcShannonEnt(myDat)

0.9709505944546686


# Relabel the first record with a third class, then recompute the entropy.
myDat[0][-1]='maybe'
myDat

[[1, 1, 'maybe'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]


calcShannonEnt(myDat)

1.3709505944546687


# Split a data set on one feature value.
def splitDataSet(dataSet, axis, value):
    """Return the records whose feature *axis* equals *value*, minus that column.

    Parameters:
        dataSet: the list of records to split.
        axis:    index of the feature column to test.
        value:   the feature value a record must have to be kept.

    The result is a new list of new records: rows that do not match are
    dropped, and the tested column is removed from the rows that remain.
    The input records are not modified.
    """
    matching_rows = (row for row in dataSet if row[axis] == value)

    subset = []
    for row in matching_rows:
        # Everything before the tested column ...
        kept = row[:axis]
        # ... followed by everything after it.
        kept.extend(row[axis + 1:])
        subset.append(kept)
    return subset


# Start again from a fresh copy of the sample data.
myDat, labels = createDataSet()
myDat

[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]


# The second argument (0) selects the first column, i.e. the first feature.
# The third argument (0) keeps the records whose first feature equals 0.
# The bare list after each call appears to be the recorded notebook output.
splitDataSet(myDat, 0, 0)

[[1, 'no'], [1, 'no']]


splitDataSet(myDat, 0, 1)

[[1, 'yes'], [1, 'yes'], [0, 'no']]


splitDataSet(myDat, 1, 0)

[[1, 'no']]


splitDataSet(myDat, 1, 1)

[[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']]


# Pick the feature whose split yields the largest information gain.
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    For every feature column the data set is split by each distinct value
    of that feature. The entropy of each subset, weighted by the fraction
    of records that fall into it, is summed to give the entropy remaining
    after the split. Information gain is the entropy of the whole data set
    minus that remainder, i.e. the reduction in disorder.

    The last element of each record is the class label and is not a
    candidate. Returns -1 when no feature has a gain strictly above zero.
    """
    # Every column except the trailing class label is a feature.
    feature_count = len(dataSet[0]) - 1
    base_entropy = calcShannonEnt(dataSet)
    record_total = float(len(dataSet))

    best_gain, best_index = 0.0, -1
    for index in range(feature_count):
        distinct_values = {record[index] for record in dataSet}

        # Entropy left over after splitting on this feature.
        remaining_entropy = 0.0
        for value in distinct_values:
            subset = splitDataSet(dataSet, index, value)
            weight = len(subset) / record_total
            remaining_entropy += weight * calcShannonEnt(subset)

        gain = base_entropy - remaining_entropy
        if gain > best_gain:
            best_gain, best_index = gain, index
    return best_index


# Fresh sample data for the feature-selection demo.
myDat, labels = createDataSet()
myDat

[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]


# Index of the best feature to split on; the bare 0 below appears to be
# the recorded notebook output.
chooseBestFeatureToSplit(myDat)

0


# Helper for the recursive tree builder: majority vote over class labels.
def majorityCnt(classList):
    """Return the class label that occurs most often in *classList*.

    Used when the features are exhausted but the remaining records still
    disagree on their class, so no single label can be returned outright.
    When several labels tie for the highest count, the one that appears
    first in *classList* wins.

    Raises:
        ValueError: if *classList* is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # max() returns the first maximal key in insertion order, which matches
    # a stable descending sort by count.
    return max(classCount, key=classCount.get)


def createTree(dataSet, labels):
    """Recursively build a decision tree from *dataSet*.

    Parameters:
        dataSet: list of records; the last element of each record is its
                 class label, the preceding elements are feature values.
        labels:  feature names, indexed like the feature columns of
                 *dataSet*. This list is not modified.

    Returns:
        A class label when the branch is a leaf, otherwise a nested dict
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.

    A progress trace is printed to stdout while the tree is built.
    """
    # Class label of every record.
    classList = [example[-1] for example in dataSet]
    print('>>>=====----->classList is: ', classList) 

    # Every record has the same class: this branch is a leaf.
    if classList.count(classList[0]) == len(classList):
        print('首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！')
        return classList[0]
    # All features are used up and only the class column is left:
    # fall back to the most common class.
    if len(dataSet[0]) == 1:
        print('所有特征列已经处理完毕，只剩下类别列，所以退出')
        return majorityCnt(classList)

    # Index of the feature with the best information gain.
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat < 0:
        # No feature has a positive gain. Using -1 as an index would split
        # on the class-label column, so settle this branch by majority vote.
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    # Tree node: the key is the feature name, the value maps each feature
    # value to its subtree (filled in below).
    myTree = {bestFeatLabel:{}}
    print('决策树当前节点的关键字（类别名称）是: ', myTree)

    # Feature names for the subtrees: everything except the feature just
    # used. Built as a new list so the caller's list stays intact.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]

    # Distinct values the chosen feature takes in this data set.
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    print('uniqueVals is: ', uniqueVals)

    for value in uniqueVals:
        print('*'*70)
        print('当前最佳特征的特征值是: ', value)
        # Recurse on the records having this value, with the chosen
        # feature column removed; the result hangs below this value.
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value)
            , subLabels
        )
        print('>', '='*50)
        print('当前最佳特征的类别是: ', bestFeatLabel)
        print('当前节点的值（key与value组合）是: ', myTree[bestFeatLabel][value])
        print('='*50, '<')
    print('嵌套函数，输出实现了外部包围内部。')
    return myTree


# Fresh sample data and feature names for the tree-building demo.
myDat, labels = createDataSet()


# Grow the decision tree; createTree prints a trace of its recursion.
myTree = createTree(myDat, labels)

>>>=====----->classList is:  ['yes', 'yes', 'no', 'no', 'no']
决策树当前节点的关键字（类别名称）是:  {'no surfacing': {}}
uniqueVals is:  {0, 1}
**********************************************************************
当前最佳特征的特征值是:  0
>>>=====----->classList is:  ['no', 'no']
首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！
> ==================================================
当前最佳特征的类别是:  no surfacing
当前节点的值（key与value组合）是:  no
================================================== <
**********************************************************************
当前最佳特征的特征值是:  1
>>>=====----->classList is:  ['yes', 'yes', 'no']
决策树当前节点的关键字（类别名称）是:  {'flippers': {}}
uniqueVals is:  {0, 1}
**********************************************************************
当前最佳特征的特征值是:  0
>>>=====----->classList is:  ['no']
首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！
> ==================================================
当前最佳特征的类别是:  flippers
当前节点的值（key与value组合）是:  no
================================================== <
**********************************************************************
当前最佳特征的特征值是:  1
>>>=====----->classList is:  ['yes', 'yes']
首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！
> ==================================================
当前最佳特征的类别是:  flippers
当前节点的值（key与value组合）是:  yes
================================================== <
嵌套函数，输出实现了外部包围内部。
> ==================================================
当前最佳特征的类别是:  no surfacing
当前节点的值（key与value组合）是:  {'flippers': {0: 'no', 1: 'yes'}}
================================================== <
嵌套函数，输出实现了外部包围内部。


# Echo the finished tree; the dict literal below appears to be the recorded
# notebook output.
myTree

{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}


# Text-box and arrow styles used when drawing tree nodes.
# Box style for decision (internal) nodes.
decisionNode = {'boxstyle': 'sawtooth', 'fc': '0.9'}
# Box style for leaf nodes.
leafNode = {'boxstyle': 'round4', 'fc': '0.7'}
# Arrow style for the connector between a node and its parent.
arrow_args = {'arrowstyle': '<-'}

# Draw one annotated node with an arrow.
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Annotate the shared axes with a boxed node label and an arrow.

    Parameters:
        nodeTxt:  text shown inside the node's box.
        centerPt: where the text box is placed (passed as ``xytext``).
        parentPt: the point being annotated (passed as ``xy``).
        nodeType: box style dict, e.g. ``decisionNode`` or ``leafNode``.

    Both points are given in axes-fraction coordinates. The axes are read
    from the ``createPlot.ax1`` function attribute, so ``createPlot`` must
    have set it before this is called.
    """
    annotation_options = {
        'xy': parentPt,
        'xycoords': 'axes fraction',
        'xytext': centerPt,
        'textcoords': 'axes fraction',
        'va': 'center',
        'ha': 'center',
        'bbox': nodeType,
        # Arrow format.
        'arrowprops': arrow_args,
    }
    createPlot.ax1.annotate(nodeTxt, **annotation_options)

def createPlot():
    """Show a demo figure with one decision-node and one leaf-node annotation.

    Figure 1 is cleared and reused. The axes are stored on the
    ``createPlot.ax1`` function attribute, which is where ``plotNode``
    looks them up.
    """
    figure = plt.figure(1, facecolor='white')
    figure.clf()
    createPlot.ax1 = plt.subplot(111, frameon=False)

    # (text, text position, annotated point, box style) for each demo node.
    demo_nodes = (
        ('A decision node', (0.5, 0.1), (0.1, 0.5), decisionNode),
        ('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode),
    )
    for text, center, parent, style in demo_nodes:
        plotNode(text, center, parent, style)
    plt.show()


# Render the two-annotation demo figure (createPlot takes no arguments at
# this point in the file; it is redefined further down).
createPlot()


# Tree metrics for plotting: the leaf count drives the x axis and the
# number of levels drives the y axis.
def getNumLeafs(myTree):
    """Return the number of leaf nodes in *myTree*.

    *myTree* is a nested dict ``{feature_name: {value: child, ...}}``. Only
    the first key of each node dict is used as the feature name. A child
    whose type is named ``dict`` is a subtree and is counted recursively;
    any other child counts as one leaf.
    """
    root_label = list(myTree.keys())[0]
    branches = myTree[root_label]
    return sum(
        getNumLeafs(child) if type(child).__name__ == 'dict' else 1
        for child in branches.values()
    )

def getTreeDepth(myTree):
    """Return the number of decision levels on the longest path in *myTree*.

    *myTree* is a nested dict ``{feature_name: {value: child, ...}}``. Only
    the first key of each node dict is used as the feature name. A leaf
    child contributes one level; a child whose type is named ``dict``
    contributes one level plus its own depth. A node with no branches has
    depth 0.
    """
    root_label = list(myTree.keys())[0]
    deepest = 0
    for child in myTree[root_label].values():
        levels = 1
        if type(child).__name__ == 'dict':
            levels += getTreeDepth(child)
        deepest = max(deepest, levels)
    return deepest


# Canned trees, so the plotting code can be tried without rebuilding a tree.
def retrieveTree(i):
    """Return the *i*-th predefined decision tree.

    Index 0 is the two-level tree over 'no surfacing' and 'flippers';
    index 1 adds a third level ('head') under 'flippers'. The trees are
    rebuilt on every call, so mutating a returned tree does not affect
    later calls.
    """
    shallow_flippers = {'flippers': {0: 'no', 1: 'yes'}}
    head_node = {'head': {0: 'no', 1: 'yes'}}
    deep_flippers = {'flippers': {0: head_node, 1: 'no'}}

    stored_trees = [
        {'no surfacing': {0: 'no', 1: shallow_flippers}},
        {'no surfacing': {0: 'no', 1: deep_flippers}},
    ]
    return stored_trees[i]


# Echo both canned trees; the bare literal after each echoed expression in
# this section appears to be the recorded notebook output.
retrieveTree(1)

{'no surfacing': {0: 'no',
  1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}


retrieveTree(0)

{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}


# Measure the smaller tree: leaf count, then depth.
myTree = retrieveTree(0)


getNumLeafs(myTree)

3


getTreeDepth(myTree)

2


def plotMidText(cntrPt, parentPt, txtString):
    """Write *txtString* halfway between a child node and its parent.

    *cntrPt* and *parentPt* are (x, y) points. The text is drawn on the
    axes stored in ``createPlot.ax1``.
    """
    # Midpoint of the segment joining the two nodes.
    mid_x = cntrPt[0] + (parentPt[0] - cntrPt[0]) / 2.0
    mid_y = cntrPt[1] + (parentPt[1] - cntrPt[1]) / 2.0
    createPlot.ax1.text(mid_x, mid_y, txtString)

def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw *myTree* below the point *parentPt*.

    Parameters:
        myTree:   nested dict ``{feature_name: {value: child, ...}}``.
        parentPt: (x, y) position of the parent node; the arrow for this
                  subtree's root is attached to it.
        nodeTxt:  text written on the edge from the parent to this
                  subtree's root (the parent's feature value).

    Layout state lives in function attributes that the caller must set
    before the first call: ``plotTree.totalW`` (leaf count of the whole
    tree), ``plotTree.totalD`` (depth of the whole tree), and the cursor
    ``plotTree.xOff`` / ``plotTree.yOff``, which is updated as nodes are
    placed.
    """
    # The subtree's leaf count decides how wide a span its root is
    # centred over.
    numLeafs = getNumLeafs(myTree)
    # Feature name at the root of this subtree.
    firstStr = list(myTree.keys())[0]
    # Centre the root over the horizontal span its leaves will occupy,
    # starting from the current x cursor.
    cntrPt = (
          plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW
        , plotTree.yOff
    )
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)

    secondDict = myTree[firstStr]

    # Step down one level before drawing the children.
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD

    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            # Subtree: recurse, using this node as the parent.
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            # Leaf: move the x cursor one leaf slot right and draw it.
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(
                  secondDict[key]
                , (plotTree.xOff, plotTree.yOff)
                , cntrPt
                , leafNode
            )
            plotMidText(
                  (plotTree.xOff, plotTree.yOff)
                , cntrPt
                , str(key)
            )
    # Restore the y cursor so this node's siblings are drawn on its level.
    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD

# Entry point for drawing a whole tree.
def createPlot(inTree):
    """Draw the decision tree *inTree* on a cleared figure 1 and show it.

    The axes are stored on ``createPlot.ax1`` for ``plotNode`` and
    ``plotMidText``. The layout state read by ``plotTree`` (``totalW``,
    ``totalD``, ``xOff``, ``yOff``) is initialised here before the
    recursive drawing starts.
    """
    figure = plt.figure(1, facecolor='white')
    figure.clf()
    # No frame and no tick marks: only the tree itself is shown.
    createPlot.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[])

    plotTree.totalW = leaf_total = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start the x cursor half a leaf slot left of the origin, and the
    # y cursor at the top of the axes.
    plotTree.xOff, plotTree.yOff = -0.5 / leaf_total, 1.0

    plotTree(inTree, (0.5, 1.0), '')
    plt.show()


# Plot the smaller canned tree.
myTree = retrieveTree(0)


createPlot(myTree)


# Add a third branch holding a leaf under the root and plot again.
myTree['no surfacing'][3]='maybe'


createPlot(myTree)


# Add a fourth branch holding a nested subtree, echo the tree, and plot it.
# The dict literal below appears to be the recorded notebook output.
myTree['no surfacing'][4]={'another': {0: {'ano_c': {0: 'no', 1: 'yes'}}, 1: 'yes'}}
myTree

{'no surfacing': {0: 'no',
  1: {'flippers': {0: 'no', 1: 'yes'}},
  3: 'maybe',
  4: {'another': {0: {'ano_c': {0: 'no', 1: 'yes'}}, 1: 'yes'}}}}


createPlot(myTree)


def classify(inputTree, featLabels, testVec):
    """Classify *testVec* by walking the decision tree *inputTree*.

    Parameters:
        inputTree:  nested dict ``{feature_name: {value: child, ...}}``.
        featLabels: feature names, ordered like the values in *testVec*.
        testVec:    feature values of the record to classify.

    Returns:
        The class label stored at the leaf the record ends up in.

    Raises:
        ValueError: if a feature name from the tree is missing from
            *featLabels* (raised by ``list.index``), or if the record's
            value for a feature has no matching branch in the tree.
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    # Translate the node's feature name into a position in testVec.
    featIndex = featLabels.index(firstStr)
    featValue = testVec[featIndex]

    for key, subtree in secondDict.items():
        if featValue == key:
            # A subtree means more questions to ask; anything else is the
            # class label itself.
            if type(subtree).__name__ == 'dict':
                return classify(subtree, featLabels, testVec)
            return subtree

    # No branch matched: fail with a clear message instead of hitting an
    # unbound local variable.
    raise ValueError(
        'no branch for feature {!r} with value {!r}'.format(firstStr, featValue)
    )


# Fresh data and feature names, plus the smaller canned tree, for the
# classification demo. The bare literal after each echoed expression in
# this section appears to be the recorded notebook output.
myDat, labels = createDataSet()


myDat

[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]


labels

['no surfacing', 'flippers']


myTree = retrieveTree(0)


myTree

{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}


# Classify a record with 'no surfacing' = 1 and 'flippers' = 0.
classify(myTree, labels, [1,0])

'no'


# Classify a record with both features equal to 1.
classify(myTree, labels, [1,1])

'yes'


def storeTree(inputTree, filename):
    """Pickle *inputTree* to the file *filename*.

    The file is opened in binary write mode, so an existing file is
    overwritten. The tree is also printed to stdout before it is written.
    """
    import pickle
    # The with-block closes the file even if printing or pickling fails.
    with open(filename, 'wb') as fw:
        print(inputTree)
        pickle.dump(inputTree, fw)

def grabTree(filename):
    """Load and return the object pickled in the file *filename*.

    NOTE(security): unpickling can execute arbitrary code. Only load files
    that come from a trusted source, such as ones written by storeTree.
    """
    import pickle
    # The with-block closes the file once the object has been read.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)


# Round-trip the tree through a pickle file. The dict literal after each
# call appears to be recorded notebook output: the first is what storeTree
# printed, the second is the value grabTree returned.
storeTree(myTree, 'classifierStorage.txt')

{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}


grabTree('classifierStorage.txt')

{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}


# Load the contact-lens data: each line of the file becomes one record,
# split on tab characters after stripping surrounding whitespace.
# The with-block closes the file as soon as it has been read.
with open('rawdata/ch03/lenses.txt') as fr:
    lenses = [inst.strip().split('\t') for inst in fr]
lenses

[['young', 'myope', 'no', 'reduced', 'no lenses'],
 ['young', 'myope', 'no', 'normal', 'soft'],
 ['young', 'myope', 'yes', 'reduced', 'no lenses'],
 ['young', 'myope', 'yes', 'normal', 'hard'],
 ['young', 'hyper', 'no', 'reduced', 'no lenses'],
 ['young', 'hyper', 'no', 'normal', 'soft'],
 ['young', 'hyper', 'yes', 'reduced', 'no lenses'],
 ['young', 'hyper', 'yes', 'normal', 'hard'],
 ['pre', 'myope', 'no', 'reduced', 'no lenses'],
 ['pre', 'myope', 'no', 'normal', 'soft'],
 ['pre', 'myope', 'yes', 'reduced', 'no lenses'],
 ['pre', 'myope', 'yes', 'normal', 'hard'],
 ['pre', 'hyper', 'no', 'reduced', 'no lenses'],
 ['pre', 'hyper', 'no', 'normal', 'soft'],
 ['pre', 'hyper', 'yes', 'reduced', 'no lenses'],
 ['pre', 'hyper', 'yes', 'normal', 'no lenses'],
 ['presbyopic', 'myope', 'no', 'reduced', 'no lenses'],
 ['presbyopic', 'myope', 'no', 'normal', 'no lenses'],
 ['presbyopic', 'myope', 'yes', 'reduced', 'no lenses'],
 ['presbyopic', 'myope', 'yes', 'normal', 'hard'],
 ['presbyopic', 'hyper', 'no', 'reduced', 'no lenses'],
 ['presbyopic', 'hyper', 'no', 'normal', 'soft'],
 ['presbyopic', 'hyper', 'yes', 'reduced', 'no lenses'],
 ['presbyopic', 'hyper', 'yes', 'normal', 'no lenses']]


# Names of the four feature columns of the lens data, in column order.
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
# Grow the tree for the lens data (createTree prints a trace while it
# runs), then echo the result.
lensesTree = createTree(lenses, lensesLabels)
lensesTree

>>>=====----->classList is:  ['no lenses', 'soft', 'no lenses', 'hard', 'no lenses', 'soft', 'no lenses', 'hard', 'no lenses', 'soft', 'no lenses', 'hard', 'no lenses', 'soft', 'no lenses', 'no lenses', 'no lenses', 'no lenses', 'no lenses', 'hard', 'no lenses', 'soft', 'no lenses', 'no lenses']
决策树当前节点的关键字（类别名称）是:  {'tearRate': {}}
uniqueVals is:  {'normal', 'reduced'}
**********************************************************************
当前最佳特征的特征值是:  normal
>>>=====----->classList is:  ['soft', 'hard', 'soft', 'hard', 'soft', 'hard', 'soft', 'no lenses', 'no lenses', 'hard', 'soft', 'no lenses']
决策树当前节点的关键字（类别名称）是:  {'astigmatic': {}}
uniqueVals is:  {'no', 'yes'}
**********************************************************************
当前最佳特征的特征值是:  no
>>>=====----->classList is:  ['soft', 'soft', 'soft', 'soft', 'no lenses', 'soft']
决策树当前节点的关键字（类别名称）是:  {'age': {}}
uniqueVals is:  {'presbyopic', 'pre', 'young'}
**********************************************************************
当前最佳特征的特征值是:  presbyopic
>>>=====----->classList is:  ['no lenses', 'soft']
决策树当前节点的关键字（类别名称）是:  {'prescript': {}}
uniqueVals is:  {'myope', 'hyper'}
**********************************************************************
当前最佳特征的特征值是:  myope
>>>=====----->classList is:  ['no lenses']
首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！
> ==================================================
当前最佳特征的类别是:  prescript
当前节点的值（key与value组合）是:  no lenses
================================================== <
**********************************************************************
当前最佳特征的特征值是:  hyper
>>>=====----->classList is:  ['soft']
首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！
> ==================================================
当前最佳特征的类别是:  prescript
当前节点的值（key与value组合）是:  soft
================================================== <
嵌套函数，输出实现了外部包围内部。
> ==================================================
当前最佳特征的类别是:  age
当前节点的值（key与value组合）是:  {'prescript': {'myope': 'no lenses', 'hyper': 'soft'}}
================================================== <
**********************************************************************
当前最佳特征的特征值是:  pre
>>>=====----->classList is:  ['soft', 'soft']
首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！
> ==================================================
当前最佳特征的类别是:  age
当前节点的值（key与value组合）是:  soft
================================================== <
**********************************************************************
当前最佳特征的特征值是:  young
>>>=====----->classList is:  ['soft', 'soft']
首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！
> ==================================================
当前最佳特征的类别是:  age
当前节点的值（key与value组合）是:  soft
================================================== <
嵌套函数，输出实现了外部包围内部。
> ==================================================
当前最佳特征的类别是:  astigmatic
当前节点的值（key与value组合）是:  {'age': {'presbyopic': {'prescript': {'myope': 'no lenses', 'hyper': 'soft'}}, 'pre': 'soft', 'young': 'soft'}}
================================================== <
**********************************************************************
当前最佳特征的特征值是:  yes
>>>=====----->classList is:  ['hard', 'hard', 'hard', 'no lenses', 'hard', 'no lenses']
决策树当前节点的关键字（类别名称）是:  {'prescript': {}}
uniqueVals is:  {'myope', 'hyper'}
**********************************************************************
当前最佳特征的特征值是:  myope
>>>=====----->classList is:  ['hard', 'hard', 'hard']
首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！
> ==================================================
当前最佳特征的类别是:  prescript
当前节点的值（key与value组合）是:  hard
================================================== <
**********************************************************************
当前最佳特征的特征值是:  hyper
>>>=====----->classList is:  ['hard', 'no lenses', 'no lenses']
决策树当前节点的关键字（类别名称）是:  {'age': {}}
uniqueVals is:  {'presbyopic', 'pre', 'young'}
**********************************************************************
当前最佳特征的特征值是:  presbyopic
>>>=====----->classList is:  ['no lenses']
首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！
> ==================================================
当前最佳特征的类别是:  age
当前节点的值（key与value组合）是:  no lenses
================================================== <
**********************************************************************
当前最佳特征的特征值是:  pre
>>>=====----->classList is:  ['no lenses']
首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！
> ==================================================
当前最佳特征的类别是:  age
当前节点的值（key与value组合）是:  no lenses
================================================== <
**********************************************************************
当前最佳特征的特征值是:  young
>>>=====----->classList is:  ['hard']
首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！
> ==================================================
当前最佳特征的类别是:  age
当前节点的值（key与value组合）是:  hard
================================================== <
嵌套函数，输出实现了外部包围内部。
> ==================================================
当前最佳特征的类别是:  prescript
当前节点的值（key与value组合）是:  {'age': {'presbyopic': 'no lenses', 'pre': 'no lenses', 'young': 'hard'}}
================================================== <
嵌套函数，输出实现了外部包围内部。
> ==================================================
当前最佳特征的类别是:  astigmatic
当前节点的值（key与value组合）是:  {'prescript': {'myope': 'hard', 'hyper': {'age': {'presbyopic': 'no lenses', 'pre': 'no lenses', 'young': 'hard'}}}}
================================================== <
嵌套函数，输出实现了外部包围内部。
> ==================================================
当前最佳特征的类别是:  tearRate
当前节点的值（key与value组合）是:  {'astigmatic': {'no': {'age': {'presbyopic': {'prescript': {'myope': 'no lenses', 'hyper': 'soft'}}, 'pre': 'soft', 'young': 'soft'}}, 'yes': {'prescript': {'myope': 'hard', 'hyper': {'age': {'presbyopic': 'no lenses', 'pre': 'no lenses', 'young': 'hard'}}}}}}
================================================== <
**********************************************************************
当前最佳特征的特征值是:  reduced
>>>=====----->classList is:  ['no lenses', 'no lenses', 'no lenses', 'no lenses', 'no lenses', 'no lenses', 'no lenses', 'no lenses', 'no lenses', 'no lenses', 'no lenses', 'no lenses']
首个元素出现的次数与列表长度一致，表明只有一个元素。直接返回该标签。然后退出！
> ==================================================
当前最佳特征的类别是:  tearRate
当前节点的值（key与value组合）是:  no lenses
================================================== <
嵌套函数，输出实现了外部包围内部。

{'tearRate': {'normal': {'astigmatic': {'no': {'age': {'presbyopic': {'prescript': {'myope': 'no lenses',
        'hyper': 'soft'}},
      'pre': 'soft',
      'young': 'soft'}},
    'yes': {'prescript': {'myope': 'hard',
      'hyper': {'age': {'presbyopic': 'no lenses',
        'pre': 'no lenses',
        'young': 'hard'}}}}}},
  'reduced': 'no lenses'}}


# Draw the contact-lens decision tree.
createPlot(lensesTree)

决策树实践

决策树的构造

信息增益 information gain

Tips：基尼不纯度 Gini impurity

划分数据集

递归构建决策树

在Python中使用Matplotlib注解绘制树形图

Matplotlib注解

构造注解树

测试和存储分类器

测试算法：使用决策树执行分类

使用算法：决策树的存储

示例：使用决策树预测隐形眼镜类型