import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# the book-style code below uses bare mat/ones/shape/exp, hence the wildcard import
from numpy import *
# disable scientific notation in printed output
np.set_printoptions(suppress=True, precision=10, threshold=2000, linewidth=150)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
def sigmoid_sample(x):
    return 1.0 / (1 + np.exp(-x))
sigmoid_inputs = np.arange(-5,5,0.1)
print(type(sigmoid_inputs))
print(sigmoid_inputs)
sigmoid_outputs = sigmoid_sample(sigmoid_inputs)
plt.plot(sigmoid_inputs,sigmoid_outputs)
plt.xlabel("Sigmoid Inputs")
plt.ylabel("Sigmoid Outputs")
plt.show()
<class 'numpy.ndarray'>
[-5.  -4.9 -4.8 ...  4.7  4.8  4.9]  (100 values, step 0.1)
sigmoid_inputs = np.arange(-60,60,0.1)
sigmoid_outputs = sigmoid_sample(sigmoid_inputs)
plt.plot(sigmoid_inputs,sigmoid_outputs)
plt.xlabel("Sigmoid Inputs")
plt.ylabel("Sigmoid Outputs")
plt.show()
- Initialize every regression coefficient to 1
- Repeat R times:
    - Compute the gradient over the entire dataset
    - Update the coefficient vector by $\alpha \times gradient$ (the gradient is derived just below)
- Return the regression coefficients
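For reference, the update in this loop is the gradient of the log-likelihood of the logistic model. With feature matrix $X$, label vector $y$, and coefficients $w$, a standard derivation gives

$$\nabla_w \ell(w) = X^\top\big(y - \sigma(Xw)\big), \qquad w \leftarrow w + \alpha\, X^\top\big(y - \sigma(Xw)\big)$$

which is exactly the `weights + alpha * dataMatrix.transpose() * error` line in `gradAscent` below.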
# Load the training data.
# Returns the feature matrix and the class-label vector.
def loadDataSet():
    dataMat = []
    labelMat = []
    fr = open('rawdata/ch05/testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        # prepend the bias term X0 = 1.0
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
dataArr, labelMat = loadDataSet()
# the + 1e-8 keeps the denominator from ever being exactly 0
def sigmoid(inX):
    return 1.0 / (1 + exp(-inX) + 1e-8)
# An overflow-safe variant: for large negative inX, exp(-inX) overflows,
# so rewrite the negative branch in terms of exp(inX) instead
# def sigmoid(inX):
#     if inX >= 0:
#         return (1.0 / (1 + exp(-inX))) + 1e-8
#     else:
#         return (exp(inX) / (1 + exp(inX))) + 1e-8
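The branch in the commented-out variant above only works on scalars. A minimal vectorized sketch of the same overflow-safe idea (the name `sigmoid_stable` is mine, not the book's):

import numpy as np

def sigmoid_stable(x):
    # overflow-safe sigmoid for NumPy arrays
    x = np.asarray(x, dtype=float)
    out = np.empty_like(x)
    pos = x >= 0
    # for x >= 0, exp(-x) <= 1, so 1 / (1 + exp(-x)) cannot overflow
    out[pos] = 1.0 / (1.0 + np.exp(-x[pos]))
    # for x < 0, rewrite as exp(x) / (1 + exp(x)); exp(x) <= 1 there
    expx = np.exp(x[~pos])
    out[~pos] = expx / (1.0 + expx)
    return out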
# Tips: mat
# convert a Python list to a NumPy matrix
lst = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
mat(lst)
matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# Tips: mat
# convert a NumPy ndarray to a NumPy matrix
n = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
np.mat(n)
matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
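The `matrix` class is discouraged in current NumPy documentation in favor of plain ndarrays; a small sketch of the same product written with the `@` operator instead:

import numpy as np

a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
w = np.ones((3, 1))
print(a @ w)  # matrix product without np.mat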
def gradAscent(dataMatIn, classLabels):
    # convert the input feature lists to a NumPy matrix
    dataMatrix = mat(dataMatIn)
    # convert the class labels to a NumPy matrix, transposing
    # the row vector into a column vector
    labelMat = mat(classLabels).transpose()
    # dimensions of the feature matrix
    # m: first dimension, number of rows = number of examples
    # n: second dimension, number of columns = number of features
    m, n = shape(dataMatrix)
    # step size toward the target
    alpha = 0.001
    # number of iterations
    maxCycles = 500
    # initial coefficient (regression weight) vector:
    # an all-ones column vector with one row per feature
    weights = ones((n, 1))
    # iterate the fixed number of times
    for k in range(maxCycles):
        # matrix arithmetic:
        # dataMatrix.shape is (100, 3), weights.shape is (3, 1),
        # so (dataMatrix * weights).shape is (100, 1): one value
        # w0*X0 + w1*X1 + w2*X2 per example (300 multiplications),
        # then squashed through the sigmoid
        h = sigmoid(dataMatrix * weights)
        # difference between the true class and the prediction
        error = (labelMat - h)
        # move the coefficients in the direction of that difference:
        # dataMatrix.transpose().shape is (3, 100), error.shape is (100, 1),
        # (3, 100) * (100, 1) = (3, 1), matching weights.shape
        weights = weights + alpha * dataMatrix.transpose() * error
    # convert to lists for easier inspection (values from the last iteration)
    print('h = ', np.array(h).tolist())
    print('error = ', np.array(error).tolist())
    # return the trained regression coefficients
    return weights
dataArr, labelMat = loadDataSet()
# obtain the coefficients (regression parameters) -- a vector
weights = gradAscent(dataArr, labelMat)
weights
h =     [[0.010453885014499422], [0.6403882362951788], [0.4327429963253981], ...]  (100 values)
error = [[-0.010453885014499422], [0.3596117637048212], [-0.4327429963253981], ...]  (100 values)
matrix([[ 4.1241435695],
        [ 0.4800732934],
        [-0.6168482053]])
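The same batch update can also be written with plain ndarrays and the `@` operator instead of the `matrix` class. A minimal sketch (the function name is mine), reusing `loadDataSet()` from above:

import numpy as np

def grad_ascent_array(data, labels, alpha=0.001, max_cycles=500):
    X = np.asarray(data, dtype=float)                   # (m, n), with X0 = 1.0
    y = np.asarray(labels, dtype=float).reshape(-1, 1)  # (m, 1) column vector
    w = np.ones((X.shape[1], 1))                        # (n, 1) coefficients
    for _ in range(max_cycles):
        h = 1.0 / (1.0 + np.exp(-(X @ w)))              # (m, 1) predictions
        w += alpha * X.T @ (y - h)                      # gradient step
    return w

# should land close to the matrix-based result above
print(grad_ascent_array(dataArr, labelMat))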
def plotBestFit(dataMat1, labelMat1, weights):
    import matplotlib.pyplot as plt
    dataArr = np.array(dataMat1)
    n = np.shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        # note: use the labelMat1 parameter, not the global labelMat
        if int(labelMat1[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # x values for the boundary line
    x = np.arange(-2.5, 2.5, 0.1)
    # the number of y values must match the number of x values
    x_y = len(x)
    # ❶ best-fit (separating) line
    # for the sigmoid, 0 is the boundary between the two classes (0 and 1),
    # so set 0 = w0*x0 + w1*x1 + w2*x2, move terms across, divide by w2,
    # and solve for X2 as a function of X1 (the separating line; note X0 = 1)
    y1 = (-weights[0] - weights[1] * x) / weights[2]
    y = y1.reshape(x_y, 1)
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
plotBestFit(np.array(dataArr),np.array(labelMat),weights)
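To spell out the boundary computation in `plotBestFit`: the sigmoid crosses $0.5$ where its argument is $0$, so the separating line satisfies

$$0 = w_0 x_0 + w_1 x_1 + w_2 x_2, \quad x_0 = 1 \;\Rightarrow\; x_2 = \frac{-w_0 - w_1 x_1}{w_2}$$

which is the `y1` expression in the code above.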
def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)
    # per-step history of each coefficient, for the convergence plots below
    y1 = []
    y2 = []
    y3 = []
    for i in range(m):
        # unlike batch gradient ascent (h = sigmoid(dataMatrix * weights)),
        # stochastic gradient ascent updates on a single example at a time
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
        y1.append(weights[0])
        y2.append(weights[1])
        y3.append(weights[2])
    return weights, y1, y2, y3
dataArr2, labelMat2 = loadDataSet()
weights2,y21,y22,y23 = stocGradAscent0(array(dataArr2), labelMat2)
plotBestFit(np.array(dataArr2),np.array(labelMat2),weights2)
def stocGradAscent1_b(dataMatrix, classLabels, numIter=1):
    m, n = shape(dataMatrix)
    weights = ones(n)
    # per-step history of each coefficient, for the convergence plots below
    y1 = []
    y2 = []
    y3 = []
    for j in range(numIter):
        data_index = list(range(m))
        for i in range(m):
            # adjust alpha on every step
            alpha = 4 / (1.0 + j + i) + 0.0001
            # pick a random example for the update,
            # to damp the periodic oscillations in the coefficients
            # (note: unlike stocGradAscent1 below, this variant never deletes
            # used indices, so an example may be picked more than once per pass)
            rand_index = int(random.uniform(0, len(data_index)))
            h = sigmoid(sum(dataMatrix[rand_index] * weights))
            error = classLabels[rand_index] - h
            weights = weights + alpha * error * dataMatrix[rand_index]
            y1.append(weights[0])
            y2.append(weights[1])
            y3.append(weights[2])
    return weights, y1, y2, y3
weights3,y1,y2,y3 = stocGradAscent1_b(array(dataArr2), labelMat2)
plotBestFit(np.array(dataArr2),np.array(labelMat2),weights3)
def plotBestFit2(y1, y2, y3):
    import matplotlib.pyplot as plt
    # x axis: one tick per update step
    x = np.arange(0, len(y1), 1)
    fig = plt.figure(figsize=(16, 8))
    ax = fig.add_subplot(111)
    # ❶ convergence traces of the three regression coefficients
    ax.plot(x, y1, 'g')
    ax.plot(x, y2, 'r')
    ax.plot(x, y3, 'y')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
plotBestFit2(y21,y22,y23)
plotBestFit2(y1,y2,y3)
import numpy as np
import random
import warnings
#suppress warnings
warnings.filterwarnings('ignore')
# sigmoid (smooth step) function
# def sigmoid(inX):
#     # return 1.0 / (1 + exp(-inX))
#     return 1.0 / (1 + np.exp(-inX) + 1e-8)
def sigmoid(x):
    if x >= 0:
        return 1.0 / (1 + np.power(np.e, -x))
    else:
        # for negative x, rewrite the sigmoid so that the
        # exponential cannot overflow to a huge number
        return np.power(np.e, x) / (1 + np.power(np.e, x))
# stochastic gradient ascent (with randomized example selection)
def stocGradAscent1(dataMatIn, classLabels, numIter=150):
    m, n = np.shape(dataMatIn)
    # coefficient vector with one entry per feature (here: 1 row, 3 columns)
    weights = np.ones(n)
    # numIter passes over the data (default 150), to watch for convergence
    for j in range(numIter):
        # indices of the examples not yet used in this pass: [0, 1, ..., m-1]
        dataIndex = list(range(m))
        for i in range(m):
            # as i and j grow, alpha keeps shrinking, but it never
            # reaches 0 because of the constant 0.0001 term
            alpha = 4 / (1.0 + j + i) + 0.0001
            # random.uniform(x, y) returns a random float in [x, y);
            # truncating gives a random position within dataIndex
            randIndex = int(random.uniform(0, len(dataIndex)))
            # sum(dataMatIn[...] * weights) evaluates f(x) = w0*x0 + w1*x1 + ... + wn*xn
            h = sigmoid(sum(dataMatIn[dataIndex[randIndex]] * weights))
            error = classLabels[dataIndex[randIndex]] - h
            weights = weights + alpha * error * dataMatIn[dataIndex[randIndex]]
            # remove the used index so each example is picked once per pass
            del(dataIndex[randIndex])
    return weights
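As a quick sanity check on the step-size schedule (an illustration, not from the book): alpha shrinks with both the pass index j and the step index i, and the 0.0001 term keeps it strictly positive:

for j in (0, 1, 10):
    for i in (0, 50, 99):
        print('j=%2d  i=%2d  alpha=%.6f' % (j, i, 4 / (1.0 + j + i) + 0.0001))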
# classification function: computes the sigmoid of the weighted feature sum
def classifyVector(inX, weights):
    '''
    Desc:
        Final classification function: computes the sigmoid of the regression
        coefficients dotted with the feature vector, and thresholds at 0.5.
    Args:
        inX -- feature vector
        weights -- regression coefficients from (stochastic) gradient ascent
    Returns:
        1.0 if the computed probability is greater than 0.5,
        0.0 otherwise
    '''
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5: return 1.0
    else: return 0.0
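A minimal usage sketch (assuming `dataArr`, `labelMat`, and the batch-trained `weights` from the earlier cells are still in scope): score the `testSet.txt` examples with `classifyVector`:

w = np.asarray(weights).flatten()  # matrix (3, 1) -> array of shape (3,)
preds = [classifyVector(np.array(row), w) for row in dataArr]
acc = sum(int(p) == y for p, y in zip(preds, labelMat)) / float(len(labelMat))
print('training accuracy: %.2f' % acc)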
# open the training and test sets and parse the data
def colicTest():
    '''
    Desc:
        Opens the training and test sets and formats the data.
    Args:
        None
    Returns:
        errorRate -- the classification error rate
    '''
    frTrain = open('rawdata/ch05/horseColicTraining.txt')
    frTest = open('rawdata/ch05/horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    # parse the features and labels of the training set:
    # trainingSet holds the feature vectors,
    # trainingLabels holds the corresponding class labels
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        # the training set has 21 feature columns, hence range(21)
        for i in range(21):
            lineArr.append(float(currLine[i]))
        # append the feature vector
        trainingSet.append(lineArr)
        # append the class label
        trainingLabels.append(float(currLine[21]))
    # fit the best regression coefficients on this data set
    # with the improved stochastic gradient ascent algorithm
    trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 500)
    errorCount = 0
    numTestVec = 0.0
    # run the test set through the classifier, counting the
    # misclassified examples and computing the final error rate
    for line in frTest.readlines():
        # count every test example
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(np.array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount) / numTestVec)
    print("the error rate of this test is: %f" % errorRate)
    return errorRate
# run colicTest() 10 times and average the results
def multiTest():
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests)))
multiTest()
the error rate of this test is: 0.253731
the error rate of this test is: 0.343284
the error rate of this test is: 0.298507
the error rate of this test is: 0.283582
the error rate of this test is: 0.298507
the error rate of this test is: 0.298507
the error rate of this test is: 0.313433
the error rate of this test is: 0.358209
the error rate of this test is: 0.388060
the error rate of this test is: 0.298507
after 10 iterations the average error rate is: 0.313433