import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# the book-style code below uses bare mat/ones/shape/exp, hence the wildcard import
from numpy import *
# disable scientific notation in printed output
np.set_printoptions(suppress=True, precision=10, threshold=2000, linewidth=150)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
def sigmoid_sample(x):
    return 1.0 / (1 + np.exp(-x))
sigmoid_inputs = np.arange(-5,5,0.1)
print(type(sigmoid_inputs))
print(sigmoid_inputs)
sigmoid_outputs = sigmoid_sample(sigmoid_inputs)
plt.plot(sigmoid_inputs,sigmoid_outputs)
plt.xlabel("Sigmoid Inputs")
plt.ylabel("Sigmoid Outputs")
plt.show()
<class 'numpy.ndarray'>
[-5.  -4.9 -4.8 ...  4.7  4.8  4.9]  (100 values, step 0.1)
sigmoid_inputs = np.arange(-60,60,0.1)
sigmoid_outputs = sigmoid_sample(sigmoid_inputs)
plt.plot(sigmoid_inputs,sigmoid_outputs)
plt.xlabel("Sigmoid Inputs")
plt.ylabel("Sigmoid Outputs")
plt.show()
- Initialize every regression coefficient to 1
- Repeat R times:
    - Compute the gradient over the entire dataset
    - Update the coefficient vector by $\alpha \times gradient$ (the gradient is derived just below)
- Return the regression coefficients
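For reference, the update in this loop is the gradient of the log-likelihood of the logistic model. With feature matrix $X$, label vector $y$, and coefficients $w$, a standard derivation gives

$$\nabla_w \ell(w) = X^\top\big(y - \sigma(Xw)\big), \qquad w \leftarrow w + \alpha\, X^\top\big(y - \sigma(Xw)\big)$$

which is exactly the `weights + alpha * dataMatrix.transpose() * error` line in `gradAscent` below.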
# Load the training data.
# Returns the feature matrix and the class-label vector.
def loadDataSet():
    dataMat = []
    labelMat = []
    fr = open('rawdata/ch05/testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        # prepend the bias term X0 = 1.0
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
dataArr, labelMat = loadDataSet()
# the + 1e-8 keeps the denominator from ever being exactly 0
def sigmoid(inX):
    return 1.0 / (1 + exp(-inX) + 1e-8)
# An overflow-safe variant: for large negative inX, exp(-inX) overflows,
# so rewrite the negative branch in terms of exp(inX) instead
# def sigmoid(inX):
#     if inX >= 0:
#         return (1.0 / (1 + exp(-inX))) + 1e-8
#     else:
#         return (exp(inX) / (1 + exp(inX))) + 1e-8
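The branch in the commented-out variant above only works on scalars. A minimal vectorized sketch of the same overflow-safe idea (the name `sigmoid_stable` is mine, not the book's):

import numpy as np

def sigmoid_stable(x):
    # overflow-safe sigmoid for NumPy arrays
    x = np.asarray(x, dtype=float)
    out = np.empty_like(x)
    pos = x >= 0
    # for x >= 0, exp(-x) <= 1, so 1 / (1 + exp(-x)) cannot overflow
    out[pos] = 1.0 / (1.0 + np.exp(-x[pos]))
    # for x < 0, rewrite as exp(x) / (1 + exp(x)); exp(x) <= 1 there
    expx = np.exp(x[~pos])
    out[~pos] = expx / (1.0 + expx)
    return out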
# Tips: mat
# convert a Python list to a NumPy matrix
lst = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
mat(lst)
matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# Tips: mat
# convert a NumPy ndarray to a NumPy matrix
n = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
np.mat(n)
matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
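The `matrix` class is discouraged in current NumPy documentation in favor of plain ndarrays; a small sketch of the same product written with the `@` operator instead:

import numpy as np

a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
w = np.ones((3, 1))
print(a @ w)  # matrix product without np.mat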
def gradAscent(dataMatIn, classLabels):
    # convert the input feature lists to a NumPy matrix
    dataMatrix = mat(dataMatIn)
    # convert the class labels to a NumPy matrix, transposing
    # the row vector into a column vector
    labelMat = mat(classLabels).transpose()
    # dimensions of the feature matrix
    # m: first dimension, number of rows = number of examples
    # n: second dimension, number of columns = number of features
    m, n = shape(dataMatrix)
    # step size toward the target
    alpha = 0.001
    # number of iterations
    maxCycles = 500
    # initial coefficient (regression weight) vector:
    # an all-ones column vector with one row per feature
    weights = ones((n, 1))
    # iterate the fixed number of times
    for k in range(maxCycles):
        # matrix arithmetic:
        # dataMatrix.shape is (100, 3), weights.shape is (3, 1),
        # so (dataMatrix * weights).shape is (100, 1): one value
        # w0*X0 + w1*X1 + w2*X2 per example (300 multiplications),
        # then squashed through the sigmoid
        h = sigmoid(dataMatrix * weights)
        # difference between the true class and the prediction
        error = (labelMat - h)
        # move the coefficients in the direction of that difference:
        # dataMatrix.transpose().shape is (3, 100), error.shape is (100, 1),
        # (3, 100) * (100, 1) = (3, 1), matching weights.shape
        weights = weights + alpha * dataMatrix.transpose() * error
    # convert to lists for easier inspection (values from the last iteration)
    print('h = ', np.array(h).tolist())
    print('error = ', np.array(error).tolist())
    # return the trained regression coefficients
    return weights
dataArr, labelMat = loadDataSet()
# obtain the coefficients (regression parameters) -- a vector
weights = gradAscent(dataArr, labelMat)
weights
h =     [[0.010453885014499422], [0.6403882362951788], [0.4327429963253981], ...]  (100 values)
error = [[-0.010453885014499422], [0.3596117637048212], [-0.4327429963253981], ...]  (100 values)
matrix([[ 4.1241435695],
        [ 0.4800732934],
        [-0.6168482053]])
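The same batch update can also be written with plain ndarrays and the `@` operator instead of the `matrix` class. A minimal sketch (the function name is mine), reusing `loadDataSet()` from above:

import numpy as np

def grad_ascent_array(data, labels, alpha=0.001, max_cycles=500):
    X = np.asarray(data, dtype=float)                   # (m, n), with X0 = 1.0
    y = np.asarray(labels, dtype=float).reshape(-1, 1)  # (m, 1) column vector
    w = np.ones((X.shape[1], 1))                        # (n, 1) coefficients
    for _ in range(max_cycles):
        h = 1.0 / (1.0 + np.exp(-(X @ w)))              # (m, 1) predictions
        w += alpha * X.T @ (y - h)                      # gradient step
    return w

# should land close to the matrix-based result above
print(grad_ascent_array(dataArr, labelMat))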
def plotBestFit(dataMat1, labelMat1, weights):
    import matplotlib.pyplot as plt
    dataArr = np.array(dataMat1)
    n = np.shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        # note: use the labelMat1 parameter, not the global labelMat
        if int(labelMat1[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # x values for the boundary line
    x = np.arange(-2.5, 2.5, 0.1)
    # the number of y values must match the number of x values
    x_y = len(x)
    # ❶ best-fit (separating) line
    # for the sigmoid, 0 is the boundary between the two classes (0 and 1),
    # so set 0 = w0*x0 + w1*x1 + w2*x2, move terms across, divide by w2,
    # and solve for X2 as a function of X1 (the separating line; note X0 = 1)
    y1 = (-weights[0] - weights[1] * x) / weights[2]
    y = y1.reshape(x_y, 1)
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
plotBestFit(np.array(dataArr),np.array(labelMat),weights)
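To spell out the boundary computation in `plotBestFit`: the sigmoid crosses $0.5$ where its argument is $0$, so the separating line satisfies

$$0 = w_0 x_0 + w_1 x_1 + w_2 x_2, \quad x_0 = 1 \;\Rightarrow\; x_2 = \frac{-w_0 - w_1 x_1}{w_2}$$

which is the `y1` expression in the code above.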
def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)
    # per-step history of each coefficient, for the convergence plots below
    y1 = []
    y2 = []
    y3 = []
    for i in range(m):
        # unlike batch gradient ascent (h = sigmoid(dataMatrix * weights)),
        # stochastic gradient ascent updates on a single example at a time
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
        y1.append(weights[0])
        y2.append(weights[1])
        y3.append(weights[2])
    return weights, y1, y2, y3
dataArr2, labelMat2 = loadDataSet()
weights2,y21,y22,y23 = stocGradAscent0(array(dataArr2), labelMat2)
plotBestFit(np.array(dataArr2),np.array(labelMat2),weights2)
def stocGradAscent1_b(dataMatrix, classLabels, numIter=1):
    m, n = shape(dataMatrix)
    weights = ones(n)
    # per-step history of each coefficient, for the convergence plots below
    y1 = []
    y2 = []
    y3 = []
    for j in range(numIter):
        data_index = list(range(m))
        for i in range(m):
            # adjust alpha on every step
            alpha = 4 / (1.0 + j + i) + 0.0001
            # pick a random example for the update,
            # to damp the periodic oscillations in the coefficients
            # (note: unlike stocGradAscent1 below, this variant never deletes
            # used indices, so an example may be picked more than once per pass)
            rand_index = int(random.uniform(0, len(data_index)))
            h = sigmoid(sum(dataMatrix[rand_index] * weights))
            error = classLabels[rand_index] - h
            weights = weights + alpha * error * dataMatrix[rand_index]
            y1.append(weights[0])
            y2.append(weights[1])
            y3.append(weights[2])
    return weights, y1, y2, y3
weights3,y1,y2,y3 = stocGradAscent1_b(array(dataArr2), labelMat2)
plotBestFit(np.array(dataArr2),np.array(labelMat2),weights3)
def plotBestFit2(y1, y2, y3):
    import matplotlib.pyplot as plt
    # x axis: one tick per update step
    x = np.arange(0, len(y1), 1)
    fig = plt.figure(figsize=(16, 8))
    ax = fig.add_subplot(111)
    # ❶ convergence traces of the three regression coefficients
    ax.plot(x, y1, 'g')
    ax.plot(x, y2, 'r')
    ax.plot(x, y3, 'y')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
plotBestFit2(y21,y22,y23)
plotBestFit2(y1,y2,y3)
import numpy as np
import random
import warnings
#suppress warnings
warnings.filterwarnings('ignore')
# sigmoid (smooth step) function
# def sigmoid(inX):
#     # return 1.0 / (1 + exp(-inX))
#     return 1.0 / (1 + np.exp(-inX) + 1e-8)
def sigmoid(x):
    if x >= 0:
        return 1.0 / (1 + np.power(np.e, -x))
    else:
        # for negative x, rewrite the sigmoid so that the
        # exponential cannot overflow to a huge number
        return np.power(np.e, x) / (1 + np.power(np.e, x))
# stochastic gradient ascent (with randomized example selection)
def stocGradAscent1(dataMatIn, classLabels, numIter=150):
    m, n = np.shape(dataMatIn)
    # coefficient vector with one entry per feature (here: 1 row, 3 columns)
    weights = np.ones(n)
    # numIter passes over the data (default 150), to watch for convergence
    for j in range(numIter):
        # indices of the examples not yet used in this pass: [0, 1, ..., m-1]
        dataIndex = list(range(m))
        for i in range(m):
            # as i and j grow, alpha keeps shrinking, but it never
            # reaches 0 because of the constant 0.0001 term
            alpha = 4 / (1.0 + j + i) + 0.0001
            # random.uniform(x, y) returns a random float in [x, y);
            # truncating gives a random position within dataIndex
            randIndex = int(random.uniform(0, len(dataIndex)))
            # sum(dataMatIn[...] * weights) evaluates f(x) = w0*x0 + w1*x1 + ... + wn*xn
            h = sigmoid(sum(dataMatIn[dataIndex[randIndex]] * weights))
            error = classLabels[dataIndex[randIndex]] - h
            weights = weights + alpha * error * dataMatIn[dataIndex[randIndex]]
            # remove the used index so each example is picked once per pass
            del(dataIndex[randIndex])
    return weights
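As a quick sanity check on the step-size schedule (an illustration, not from the book): alpha shrinks with both the pass index j and the step index i, and the 0.0001 term keeps it strictly positive:

for j in (0, 1, 10):
    for i in (0, 50, 99):
        print('j=%2d  i=%2d  alpha=%.6f' % (j, i, 4 / (1.0 + j + i) + 0.0001))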
# classification function: computes the sigmoid of the weighted feature sum
def classifyVector(inX, weights):
    '''
    Desc:
        Final classification function: computes the sigmoid of the regression
        coefficients dotted with the feature vector, and thresholds at 0.5.
    Args:
        inX -- feature vector
        weights -- regression coefficients from (stochastic) gradient ascent
    Returns:
        1.0 if the computed probability is greater than 0.5,
        0.0 otherwise
    '''
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5: return 1.0
    else: return 0.0
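A minimal usage sketch (assuming `dataArr`, `labelMat`, and the batch-trained `weights` from the earlier cells are still in scope): score the `testSet.txt` examples with `classifyVector`:

w = np.asarray(weights).flatten()  # matrix (3, 1) -> array of shape (3,)
preds = [classifyVector(np.array(row), w) for row in dataArr]
acc = sum(int(p) == y for p, y in zip(preds, labelMat)) / float(len(labelMat))
print('training accuracy: %.2f' % acc)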
# open the training and test sets and parse the data
def colicTest():
    '''
    Desc:
        Opens the training and test sets and formats the data.
    Args:
        None
    Returns:
        errorRate -- the classification error rate
    '''
    frTrain = open('rawdata/ch05/horseColicTraining.txt')
    frTest = open('rawdata/ch05/horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    # parse the features and labels of the training set:
    # trainingSet holds the feature vectors,
    # trainingLabels holds the corresponding class labels
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        # the training set has 21 feature columns, hence range(21)
        for i in range(21):
            lineArr.append(float(currLine[i]))
        # append the feature vector
        trainingSet.append(lineArr)
        # append the class label
        trainingLabels.append(float(currLine[21]))
    # fit the best regression coefficients on this data set
    # with the improved stochastic gradient ascent algorithm
    trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 500)
    errorCount = 0
    numTestVec = 0.0
    # run the test set through the classifier, counting the
    # misclassified examples and computing the final error rate
    for line in frTest.readlines():
        # count every test example
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(np.array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount) / numTestVec)
    print("the error rate of this test is: %f" % errorRate)
    return errorRate
# run colicTest() 10 times and average the results
def multiTest():
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests)))
multiTest()
the error rate of this test is: 0.253731
the error rate of this test is: 0.343284
the error rate of this test is: 0.298507
the error rate of this test is: 0.283582
the error rate of this test is: 0.298507
the error rate of this test is: 0.298507
the error rate of this test is: 0.313433
the error rate of this test is: 0.358209
the error rate of this test is: 0.388060
the error rate of this test is: 0.298507
after 10 iterations the average error rate is: 0.313433