from numpy import *
import numpy as np
import re
import operator
# Functions for converting a vocabulary into vectors
def load_data_set():
    # The first return value is the set of documents after tokenization.
    # The example documents come from a message board; each post has been
    # split into a list of tokens (punctuation has been stripped).
posting_list = [
['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
['stop', 'posting', 'stupid', 'worthless', 'garbage'],
['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']
]
    # The second return value is the list of class labels.
    # The labels were assigned by hand; they are used to train the classifier
    # to detect abusive posts automatically.
    # Note: this list has the same length as the first return value.
    # 1 means abusive text, 0 means normal speech.
class_vec = [0, 1, 0, 1, 0, 1]
return posting_list, class_vec
listOPosts, listClasses = load_data_set()
print('listClasses is: ', listClasses)
listOPosts
listClasses is: [0, 1, 0, 1, 0, 1]
[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
# Create a list of the unique tokens that appear across all documents
def create_vocab_list(data_set):
    # Start with an empty set
    vocab_set = set([])
    for document in data_set:
        # | is set union: merge this document's tokens into the vocabulary
        vocab_set = vocab_set | set(document)
    return list(vocab_set)
myVocabList = create_vocab_list(listOPosts)
print(myVocabList)
['please', 'how', 'dalmation', 'him', 'help', 'garbage', 'so', 'ate', 'I', 'to', 'stupid', 'licks', 'posting', 'steak', 'problems', 'cute', 'buying', 'has', 'food', 'worthless', 'dog', 'quit', 'park', 'flea', 'not', 'my', 'maybe', 'stop', 'is', 'take', 'mr', 'love']
# Input: two lists --
#   vocab_list: the list of unique tokens across all documents
#   input_set:  one document (the token list of a single post)
# Returns a vector the same length as vocab_list, where 1 means the token
# appears in the document and 0 means it does not.
# The position of each 0/1 is determined by the token's index in vocab_list.
def set_of_word_vec(vocab_list, input_set):
    # Create a vector of all zeros
    vec = [0] * len(vocab_list)
    for word in input_set:
        # If the word from the document appears in the vocabulary
        if word in vocab_list:
            # Find the index of the word in vocab_list,
            # then set the corresponding position in vec to 1
            vec[vocab_list.index(word)] = 1
        else:
            print("The word: %s is not in my vocabulary!" % word)
    return vec
print('myVocabList is: ', myVocabList, '\n')
print('listOPosts[0] is: ', listOPosts[0], '\n')
t = set_of_word_vec(myVocabList, listOPosts[0])
print(t)
myVocabList is:  ['please', 'how', 'dalmation', 'him', 'help', 'garbage', 'so', 'ate', 'I', 'to', 'stupid', 'licks', 'posting', 'steak', 'problems', 'cute', 'buying', 'has', 'food', 'worthless', 'dog', 'quit', 'park', 'flea', 'not', 'my', 'maybe', 'stop', 'is', 'take', 'mr', 'love']

listOPosts[0] is:  ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']

[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0]
print(listOPosts[3])
t = set_of_word_vec(myVocabList, listOPosts[3])
print(t)
['stop', 'posting', 'stupid', 'worthless', 'garbage']
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
- Pseudocode
  - Count the number of documents in each class
  - For each training document:
    - For each class:
      - If a token appears in the document -> increment the count for that token
      - Increment the total token count
  - For each class:
    - For each token:
      - Divide the token's count by the total token count to get the conditional probability
  - Return the conditional probability vector for each class
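# A small worked check of the last step of the pseudocode (a sketch, not part of
# the original code): count how often one token appears in the abusive (class 1)
# posts and divide by the total number of tokens in those posts.
posts, labels = load_data_set()
abusive_posts = [p for p, c in zip(posts, labels) if c == 1]
token_count = sum(p.count('stupid') for p in abusive_posts)   # 3 occurrences of 'stupid'
total_tokens = sum(len(p) for p in abusive_posts)             # 19 tokens in the class-1 posts
print(token_count / float(total_tokens))                      # 3/19, about 0.158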
# The Naive Bayes training function
# Parameters:
#   trainMatrix:   the document matrix -- a list in which each element is a
#                  document's word vector (one 0/1 entry per vocabulary token)
#   trainCategory: the vector of class labels, one per document
def trainNB0(trainMatrix, trainCategory):
    # Number of training documents (posts)
    numTrainDocs = len(trainMatrix)
    print('numTrainDocs is: ', numTrainDocs)
    # Length of the first document's word vector (number of vocabulary tokens)
    numWords = len(trainMatrix[0])
    print('numWords is: ', numWords)
    # Number of 1s in the class vector (abusive documents) / number of documents;
    # the denominator is converted to float
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Count vector for class 0 (normal), initialized to zeros
    p0Num = zeros(numWords)
    # Count vector for class 1 (abusive)
    p1Num = zeros(numWords)
    p0Denom = 0.0
    p1Denom = 0.0
    # Loop over the training documents
    for i in range(numTrainDocs):
        print('i is: ', i)
        print('trainCategory[i] is: ', trainCategory[i])
        if trainCategory[i] == 1:
            print('---->trainMatrix[i] is: ', trainMatrix[i])
            print('---->sum(trainMatrix[i]) is: ', sum(trainMatrix[i]))
            # Adding the word vectors element-wise accumulates the per-token counts
            p1Num += trainMatrix[i]
            # Denom (the denominator) accumulates the total token count,
            # i.e. the sum of the elements of p1Num
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
        print('-'*70)
    print('p0Num is: ', p0Num)
    print('p1Num is: ', p1Num)
    print('p0Denom is: ', p0Denom)
    print('p1Denom is: ', p1Denom)
    # Per-token conditional probability vectors
    p1Vect = p1Num / p1Denom
    p0Vect = p0Num / p0Denom
    return p0Vect, p1Vect, pAbusive
# Use set_of_word_vec to map each document onto the vocabulary, producing one vector per document.
# Each vector has the same length as the vocabulary; an element is 1 if the token appears, 0 otherwise.
# These vectors are what the probabilities are computed from.
trainMat = []
for postinDoc in listOPosts:
trainMat.append(set_of_word_vec(myVocabList, postinDoc))
print('listOPosts is: ', listOPosts)
print('trainMat is: ', trainMat)
listOPosts is:  [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
trainMat is:  [[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
p0v, p1v, pAb = trainNB0(trainMat, listClasses)
numTrainDocs is:  6
numWords is:  32
i is:  0
trainCategory[i] is:  0
----------------------------------------------------------------------
i is:  1
trainCategory[i] is:  1
---->trainMatrix[i] is:  [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0]
---->sum(trainMatrix[i]) is:  8
----------------------------------------------------------------------
i is:  2
trainCategory[i] is:  0
----------------------------------------------------------------------
i is:  3
trainCategory[i] is:  1
---->trainMatrix[i] is:  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
---->sum(trainMatrix[i]) is:  5
----------------------------------------------------------------------
i is:  4
trainCategory[i] is:  0
----------------------------------------------------------------------
i is:  5
trainCategory[i] is:  1
---->trainMatrix[i] is:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
---->sum(trainMatrix[i]) is:  6
----------------------------------------------------------------------
p0Num is:  [1. 1. 1. 2. 1. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 3. 0. 1. 1. 0. 1. 1.]
p1Num is:  [0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 3. 0. 1. 0. 0. 0. 1. 0. 1. 2. 2. 1. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0.]
p0Denom is:  24.0
p1Denom is:  19.0
p0v
array([0.04166667, 0.04166667, 0.04166667, 0.08333333, 0.04166667,
0. , 0.04166667, 0.04166667, 0.04166667, 0.04166667,
0. , 0.04166667, 0. , 0.04166667, 0.04166667,
0.04166667, 0. , 0.04166667, 0. , 0. ,
0.04166667, 0. , 0. , 0.04166667, 0. ,
0.125 , 0. , 0.04166667, 0.04166667, 0. ,
0.04166667, 0.04166667])
p1v
array([0. , 0. , 0. , 0.05263158, 0. ,
0.05263158, 0. , 0. , 0. , 0.05263158,
0.15789474, 0. , 0.05263158, 0. , 0. ,
0. , 0.05263158, 0. , 0.05263158, 0.10526316,
0.10526316, 0.05263158, 0.05263158, 0. , 0.05263158,
0. , 0.05263158, 0.05263158, 0. , 0.05263158,
0. , 0. ])
pAb
0.5
# 1 marks the abusive class.
# Take the largest probability value,
# find the index of that value,
# then use the index in the vocabulary to recover the token itself.
myVocabList[np.where(p1v==p1v.max())[0][0]]
'stupid'
myVocabList[np.where(p0v==p0v.max())[0][0]]
'my'
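# Beyond the single highest-probability token, the whole vocabulary can be ranked.
# A small sketch (not part of the original code) that uses np.argsort to list the
# three most probable tokens for each class:
topN = 3
print('top abusive tokens: ', [myVocabList[i] for i in np.argsort(p1v)[-topN:][::-1]])
print('top normal tokens:  ', [myVocabList[i] for i in np.argsort(p0v)[-topN:][::-1]])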
# The Naive Bayes training function (improved version)
# Parameters:
#   trainMatrix:   the document matrix -- a list in which each element is a
#                  document's word vector
#   trainCategory: the vector of class labels, one per document
def trainNB0(trainMatrix, trainCategory):
    # Number of training documents (posts)
    numTrainDocs = len(trainMatrix)
    # print('numTrainDocs is: ', numTrainDocs)
    # Length of the first document's word vector (number of vocabulary tokens)
    numWords = len(trainMatrix[0])
    # print('numWords is: ', numWords)
    # Number of 1s in the class vector (abusive documents) / number of documents;
    # the denominator is converted to float
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Initialize the count vectors to ones and the denominators to 2.0
    # so that no conditional probability is ever exactly zero
    # (a zero factor would wipe out the whole product during classification)
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    # Loop over the training documents
    for i in range(numTrainDocs):
        # print('i is: ', i)
        # print('trainCategory[i] is: ', trainCategory[i])
        if trainCategory[i] == 1:
            # print('---->trainMatrix[i] is: ', trainMatrix[i])
            # print('---->sum(trainMatrix[i]) is: ', sum(trainMatrix[i]))
            # Adding the word vectors element-wise accumulates the per-token counts;
            # Denom (the denominator) accumulates the total token count,
            # i.e. the sum of the elements of p1Num
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # print('p0Num is: ', p0Num)
    # print('p1Num is: ', p1Num)
    # print('p0Denom is: ', p0Denom)
    # print('p1Denom is: ', p1Denom)
    # Take the log of the probability ratios: multiplying many small probabilities
    # underflows to zero, whereas their logs can safely be summed for comparison
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
p0v, p1v, pAb = trainNB0(trainMat, listClasses)
p0v
array([-2.56494936, -2.56494936, -2.56494936, -2.15948425, -2.56494936,
-3.25809654, -2.56494936, -2.56494936, -2.56494936, -2.56494936,
-3.25809654, -2.56494936, -3.25809654, -2.56494936, -2.56494936,
-2.56494936, -3.25809654, -2.56494936, -3.25809654, -3.25809654,
-2.56494936, -3.25809654, -3.25809654, -2.56494936, -3.25809654,
-1.87180218, -3.25809654, -2.56494936, -2.56494936, -3.25809654,
-2.56494936, -2.56494936])
p1v
array([-3.04452244, -3.04452244, -3.04452244, -2.35137526, -3.04452244,
-2.35137526, -3.04452244, -3.04452244, -3.04452244, -2.35137526,
-1.65822808, -3.04452244, -2.35137526, -3.04452244, -3.04452244,
-3.04452244, -2.35137526, -3.04452244, -2.35137526, -1.94591015,
-1.94591015, -2.35137526, -2.35137526, -3.04452244, -2.35137526,
-3.04452244, -2.35137526, -2.35137526, -3.04452244, -2.35137526,
-3.04452244, -3.04452244])
pAb
0.5
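# Why the log transform matters: the classifier below combines many small per-token
# probabilities (as a sum of logs). A small illustration with made-up values
# (a sketch, not part of the original code): the plain product underflows to 0.0,
# while the sum of logs stays usable for comparing the two classes.
small_probs = np.full(1000, 0.01)
print(np.prod(small_probs))         # 0.0 -- the product underflows
print(np.sum(np.log(small_probs)))  # about -4605.2 -- still comparable between classes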
def classifyNB(
    # the document to classify, as a word vector
    vec2Classify
    # log-probability vector for class 0 (normal)
    , p0Vec
    # log-probability vector for class 1 (abusive)
    , p1Vec
    # prior probability of class 1
    , pClass1
):
    # The two vectors being multiplied have the same length; the multiplication
    # zeroes out the entries for tokens that are absent from the document.
    # sum() adds the remaining elements (adding logs corresponds to multiplying
    # the original probabilities), and the log of the class prior
    # (pClass1 or 1 - pClass1) is added on top.
    # print('>------------------------------------------>pClass1 is: ', pClass1)
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    # The 1e-5 is a tiny constant that keeps log() from failing with log(0)
    # in case pClass1 happens to be exactly 1
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1 + 1e-5)
    # print('>>>>>>>>>>---------->p1 is: ', p1)
    # print('>>>>>>>>>>---------->p0 is: ', p0)
    if p1 > p0:
        return 1
    else:
        return 0
def testingNB():
listOPosts, listClasses = load_data_set()
myVocabList = create_vocab_list(listOPosts)
trainMat = []
for postinDoc in listOPosts:
trainMat.append(set_of_word_vec(myVocabList, postinDoc))
p0v, p1v, pAb = trainNB0(array(trainMat), array(listClasses))
testEntry = ['love', 'my', 'dalmation']
thisDoc = array(set_of_word_vec(myVocabList, testEntry))
print(testEntry, 'classified as: ', classifyNB(thisDoc, p0v, p1v, pAb), '\n')
testEntry = ['stupid', 'garbage']
thisDoc = array(set_of_word_vec(myVocabList, testEntry))
print(testEntry, 'classified as: ', classifyNB(thisDoc, p0v, p1v, pAb), '\n')
testingNB()
['love', 'my', 'dalmation'] classified as:  0

['stupid', 'garbage'] classified as:  1
# Bag-of-words model: like set_of_word_vec, but each element counts how many
# times the token occurs in the document instead of only recording presence
def bag_of_word_vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
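# A quick illustration of the difference between the set-of-words and bag-of-words
# models (a sketch with a made-up document, not part of the original code):
# with 'dog' occurring twice, the set model records only presence while the bag model counts.
doc = ['my', 'dog', 'ate', 'my', 'dog', 'food']
print(set_of_word_vec(myVocabList, doc)[myVocabList.index('dog')])   # 1 -- presence only
print(bag_of_word_vec(myVocabList, doc)[myVocabList.index('dog')])   # 2 -- occurrence count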
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
print(mySent.split())
['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M.L.', 'I', 'have', 'ever', 'laid', 'eyes', 'upon.']
# Split with a regular expression: the delimiter is any run of characters
# other than letters, digits, or the underscore
import re
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
# \w matches a letter, digit, or underscore character;
# \W matches any character that \w does not match
regRx = re.compile('\\W+')
listOfTokens = regRx.split(mySent)
print(listOfTokens)
['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']
# Drop empty strings
# and convert everything to lower case
print([tok.lower() for tok in listOfTokens if len(tok)>0])
['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']
emailText = open('rawdata/ch04/email/ham/5.txt', 'rb').read()
emailText
listOfTokens = regRx.split(emailText.decode('utf8'))
print([tok.lower() for tok in listOfTokens if len(tok)>0])
['there', 'was', 'a', 'guy', 'at', 'the', 'gas', 'station', 'who', 'told', 'me', 'that', 'if', 'i', 'knew', 'mandarin', 'and', 'python', 'i', 'could', 'get', 'a', 'job', 'with', 'the', 'fbi']
emailText = open('rawdata/ch04/email/spam/2.txt', 'rb').read()
emailText
listOfTokens = regRx.split(emailText.decode('utf8'))
print([tok.lower() for tok in listOfTokens if len(tok)>0])
['hydrocodone', 'vicodin', 'es', 'brand', 'watson', 'vicodin', 'es', '7', '5', '750', 'mg', '30', '195', '120', '570', 'brand', 'watson', '7', '5', '750', 'mg', '30', '195', '120', '570', 'brand', 'watson', '10', '325', 'mg', '30', '199', '120', '588', 'noprescription', 'required', 'free', 'express', 'fedex', '3', '5', 'days', 'delivery', 'for', 'over', '200', 'order', 'major', 'credit', 'cards', 'e', 'check']
# Tokenize a raw text string for classification
def text_parse(big_string):
    list_of_tokens = re.split(r'\W+', big_string)
    # Drop tokens of two characters or fewer and convert everything to lower case
    return [tok.lower() for tok in list_of_tokens if len(tok) > 2]
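# A quick check of text_parse on the sentence used above (not part of the original
# code): tokens of two characters or fewer such as 'is', 'on', 'or', 'M', 'L', 'I'
# are dropped and everything is lower-cased.
print(text_parse('This book is the best book on Python or M.L. I have ever laid eyes upon.'))
# ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']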
# Tip: append adds the whole list as a single element, extend adds its items one by one
t =[]
a = [1,2,3]
b = [4,5,6]
t.append(a)
t.append(b)
print(t)
t = []
a = [1,2,3]
b = [4,5,6]
t.extend(a)
t.extend(b)
print(t)
[[1, 2, 3], [4, 5, 6]]
[1, 2, 3, 4, 5, 6]
# Tip: list(range(5)) materializes the indices; range(5) alone is a lazy range object
train_set_t = list(range(5))
print(train_set_t)
train_set_t = range(5)
print(train_set_t)
[0, 1, 2, 3, 4]
range(0, 5)
# Spam e-mail detection
def spam_text():
    doc_list = []
    full_text = []
    class_list = []
    # Load and parse the text files
    for i in range(1, 26):
        # Tokenize every file in the spam directory and add it to doc_list
        word_list = text_parse(open('rawdata/ch04/email/spam/%d.txt' % i).read())
        # doc_list is a list of lists: one element per document,
        # so its length equals the number of documents
        doc_list.append(word_list)
        # extend adds the individual tokens to the existing list,
        # so full_text collects every token from every document
        full_text.extend(word_list)
        # Label spam documents with 1
        class_list.append(1)
        word_list = text_parse(open('rawdata/ch04/email/ham/%d.txt' % i).read())
        doc_list.append(word_list)
        full_text.extend(word_list)
        # Label ham documents with 0
        class_list.append(0)
    # create_vocab_list returns the list of unique tokens
    vocab_list = create_vocab_list(doc_list)
    # print('len(vocab_list) is: ', len(vocab_list))
    # Note: range() has to be wrapped in list() so that elements can be deleted later.
    # The training set starts as the 50 document indices (this example has 50 files),
    # so class_list also has length 50: 25 ones and 25 zeros
    train_set = list(range(50))
    # The test set starts out empty
    test_set = []
    # Randomly build the test set: hold out 10 documents
    for i in range(10):
        # random.uniform(a, b) returns a value between a and b;
        # cast to int, this picks a random index into train_set
        rand_index = int(random.uniform(0, len(train_set)))
        # print('rand_index is: ', rand_index)
        # Move the randomly chosen document index into the test set
        test_set.append(train_set[rand_index])
        # print('test_set is: ', test_set)
        # print('train_set is: ', train_set)
        # print('train_set[rand_index] is: ', train_set[rand_index])
        # ... and remove it from the training set
        del(train_set[rand_index])
    # Empty containers for the training matrix and its class labels
    train_matrix = []
    train_class = []
    # Build the training matrix;
    # note that the loop runs over document indices
    for doc_index in train_set:
        train_matrix.append(
            # bag_of_word_vec returns a vector of token counts:
            # for each token in vocab_list, how often it occurs
            # in doc_list[doc_index]
            bag_of_word_vec(
                vocab_list
                , doc_list[doc_index]
            )
        )
        # print('len(train_matrix[0]) is: ', len(train_matrix[0]))
        # print('len(train_matrix) is: ', len(train_matrix))
        # The corresponding class label (0 or 1)
        train_class.append(class_list[doc_index])
    # Compute the probabilities:
    # as before, per-token counts are accumulated and divided by the total count
    # to get the probability vectors; the log of the ratios is used so that the
    # later combination (a sum of logs) never collapses to zero
    p0_vec, p1_vec, p_spam = trainNB0(
        array(train_matrix)
        , array(train_class)
    )
    error_count = 0
    # Classify the held-out test documents
    for doc_index in test_set:
        print('doc_index is: ', doc_index)
        word_vector = bag_of_word_vec(
            vocab_list
            , doc_list[doc_index]
        )
        # class_list[doc_index] is the true class of the test document;
        # classifyNB() returns the classifier's prediction: the word vector is
        # combined with the per-class log-probability vectors and the larger of
        # p1 and p0 decides the class
        if classifyNB(array(word_vector), p0_vec, p1_vec, p_spam) != class_list[doc_index]:
            error_count += 1
            print("classification error", doc_list[doc_index])
    print('-'*75)
    print('>---------------------------------------------------------------->the error rate is: ', float(error_count) / len(test_set))
spam_text()
doc_index is:  37
doc_index is:  25
doc_index is:  30
doc_index is:  2
doc_index is:  11
doc_index is:  40
doc_index is:  49
doc_index is:  43
doc_index is:  16
doc_index is:  32
classification error ['home', 'based', 'business', 'opportunity', 'knocking', 'your', 'door', 'don', 'rude', 'and', 'let', 'this', 'chance', 'you', 'can', 'earn', 'great', 'income', 'and', 'find', 'your', 'financial', 'life', 'transformed', 'learn', 'more', 'here', 'your', 'success', 'work', 'from', 'home', 'finder', 'experts']
---------------------------------------------------------------------------
>---------------------------------------------------------------->the error rate is:  0.1