# 拆分数据集为训练集和测试集
from sklearn.model_selection import train_test_split
# 导入 K近邻分类器库,该算法默认选择5个近邻作为分类依据
# (注:此说明针对下方 cross_val_score 默认的 Stratified K Fold 切分——它可大体上保证各折的类别分布相同,以避免某些子数据集出现类别分布失衡的情况)
from sklearn.neighbors import KNeighborsClassifier
# 导入 交叉检验方法库, 默认使用 Stratified K Fold 方法切分数据集
from sklearn.model_selection import cross_val_score
# 可用 MinMaxScaler 类进行基于特征的规范化
from sklearn.preprocessing import MinMaxScaler
# 用于创建流水线
from sklearn.pipeline import Pipeline
%matplotlib inline
from matplotlib import pyplot as plt
import os
import numpy as np
import csv
# Locate the data file; X and y are pre-allocated here and filled in
# by the CSV-parsing loop further down.
data_folder = os.path.expanduser("~")
# Absolute-path construction kept for reference; a relative path is used instead.
# data_filename = os.path.join(data_folder, "03_DataMiner/rawdata","ionosphere.data")
data_filename = 'rawdata/ionosphere.data'
# 351 samples, 34 float features each, plus one boolean class label per sample.
X = np.zeros(shape=(351, 34), dtype='float')
y = np.zeros(shape=(351,), dtype='bool')
X
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
y[:5]
array([False, False, False, False, False])
# Parse the ionosphere CSV: each row is 34 float features followed by a
# class label, where 'g' (good) maps to True and anything else to False.
with open(data_filename, 'r') as input_file:
    for i, row in enumerate(csv.reader(input_file)):
        # All fields except the last are features; coerce them to float.
        X[i] = [float(value) for value in row[:-1]]
        # Final field is the class label.
        y[i] = (row[-1] == 'g')
X
array([[ 1. , 0. , 0.99539, ..., -0.54487, 0.18641, -0.453 ],
[ 1. , 0. , 1. , ..., -0.06288, -0.13738, -0.02447],
[ 1. , 0. , 1. , ..., -0.2418 , 0.56045, -0.38238],
...,
[ 1. , 0. , 0.94701, ..., 0.00442, 0.92697, -0.00577],
[ 1. , 0. , 0.90608, ..., -0.03757, 0.87403, -0.16243],
[ 1. , 0. , 0.8471 , ..., -0.06678, 0.85764, -0.06151]])
y[:5]
array([ True, False, True, False, True])
# Hold out part of the samples for testing (sklearn's default 25% split),
# with a fixed seed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=14)
print("There are {} training samples".format(y_train.shape))
print("There are {} testing samples".format(y_test.shape))
There are (263,) training samples There are (88,) testing samples
X_train
array([[ 0. , 0. , 1. , ..., 0. , 0. , 0. ],
[ 1. , 0. , 1. , ..., -0.26284, 0.64207, -0.39487],
[ 0. , 0. , 0. , ..., 0. , 0. , 0. ],
...,
[ 1. , 0. , 1. , ..., 0.18897, 0.56167, 0.1518 ],
[ 1. , 0. , 0.76046, ..., 0.01484, 0.63887, 0.01525],
[ 1. , 0. , 0.99449, ..., -0.25741, 0.76586, -0.27794]])
y_train[:5]
array([False, True, False, False, False])
X_test
array([[ 1. , 0. , 1. , ..., 0.32492, 1. , 0.46712],
[ 1. , 0. , 1. , ..., -0.06288, -0.13738, -0.02447],
[ 1. , 0. , 0.99025, ..., -0.80278, 0.49195, -0.83245],
...,
[ 1. , 0. , 0.83367, ..., -0.51206, 0.64662, -0.30075],
[ 0. , 0. , 1. , ..., -1. , -1. , 1. ],
[ 1. , 0. , 1. , ..., 0.72688, 0.0699 , 0.71444]])
y_test[:5]
array([ True, False, True, True, True])
# Train a K-nearest-neighbours classifier on the training split.
# n_neighbors is left at its default (5); fit() returns the estimator
# itself, so construction and fitting can be chained.
estimator = KNeighborsClassifier().fit(X_train, y_train)
KNeighborsClassifier()
# Evaluate on the held-out test set: predict, then report the fraction
# of test samples whose predicted label matches the truth, as a percent.
y_predicted = estimator.predict(X_test)
print(y_predicted)
accuracy = (y_test == y_predicted).mean() * 100
print("The accuracy is {0:.1f}%".format(accuracy))
[ True True True True True False True False True True True True True True False False True False False True True True True False True True True True True True True True True False False True True True True True True False True False True True True True False True True True True True False True True True True True False True True True True True True True False False False True True False True True True False False True False True True False True True False True] The accuracy is 86.4%
# Cross-validate on the full data set; for a classifier, cross_val_score
# defaults to stratified k-fold splitting.
scores = cross_val_score(estimator, X, y, scoring='accuracy')
# Mean fold accuracy, as a percentage.
average_accuracy = scores.mean() * 100
print("The average accuracy is {0:.1f}%".format(average_accuracy))
The average accuracy is 82.6%
# Sweep n_neighbors from 1 through 50 and record, for each setting, the
# per-fold cross-validated accuracies and their mean.
avg_scores = []
all_scores = []
parameter_values = list(range(1, 51))  # upper bound 50 is included
for n_neighbors in parameter_values:
    estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
    scores = cross_val_score(estimator, X, y, scoring='accuracy')
    avg_scores.append(scores.mean())
    all_scores.append(scores)
print('avg_scores is: ', avg_scores[:5], '\n')
print('all_scores is: ', all_scores[:5])
avg_scores is: [0.8432997987927566, 0.8718712273641852, 0.8318712273641851, 0.8432997987927567, 0.8261971830985916] all_scores is: [array([0.84507042, 0.78571429, 0.82857143, 0.91428571, 0.84285714]), array([0.84507042, 0.81428571, 0.84285714, 0.94285714, 0.91428571]), array([0.84507042, 0.74285714, 0.78571429, 0.92857143, 0.85714286]), array([0.84507042, 0.77142857, 0.8 , 0.92857143, 0.87142857]), array([0.83098592, 0.77142857, 0.8 , 0.88571429, 0.84285714])]
# Visualise how the number of neighbours affects mean classification
# accuracy: one '-o' marker per n_neighbors value.
plt.plot(parameter_values, avg_scores, '-o')
[<matplotlib.lines.Line2D at 0x7fca3dea27f0>]
# Work on an independent copy so the original feature matrix stays intact.
X_broken = X.copy()
X_broken
array([[ 1. , 0. , 0.99539, ..., -0.54487, 0.18641, -0.453 ],
[ 1. , 0. , 1. , ..., -0.06288, -0.13738, -0.02447],
[ 1. , 0. , 1. , ..., -0.2418 , 0.56045, -0.38238],
...,
[ 1. , 0. , 0.94701, ..., 0.00442, 0.92697, -0.00577],
[ 1. , 0. , 0.90608, ..., -0.03757, 0.87403, -0.16243],
[ 1. , 0. , 0.8471 , ..., -0.06678, 0.85764, -0.06151]])
# Deliberately break the feature scaling: '::2' is a step-2 slice over the
# COLUMNS, so every even-indexed feature (0, 2, 4, ...) is divided by 10,
# throwing those features' scales out of line with the odd-indexed ones.
X_broken[:, 0::2] = X_broken[:, 0::2] / 10
X_broken
array([[ 0.1 , 0. , 0.099539, ..., -0.54487 , 0.018641,
-0.453 ],
[ 0.1 , 0. , 0.1 , ..., -0.06288 , -0.013738,
-0.02447 ],
[ 0.1 , 0. , 0.1 , ..., -0.2418 , 0.056045,
-0.38238 ],
...,
[ 0.1 , 0. , 0.094701, ..., 0.00442 , 0.092697,
-0.00577 ],
[ 0.1 , 0. , 0.090608, ..., -0.03757 , 0.087403,
-0.16243 ],
[ 0.1 , 0. , 0.08471 , ..., -0.06678 , 0.085764,
-0.06151 ]])
# Baseline: cross-validated accuracy of a fresh, default KNN on the
# original (unscaled, uncorrupted) data.
estimator = KNeighborsClassifier()
original_scores = cross_val_score(estimator, X, y, scoring='accuracy')
print("The original average accuracy for is{0:.1f}%".format(np.mean(original_scores) * 100))
# Same estimator, but on the corrupted copy: the mis-scaled features
# dominate the distance metric and drag the accuracy down.
broken_scores = cross_val_score(estimator, X_broken, y, scoring='accuracy')
print("The 'broken' average accuracy for is {0:.1f}%".format(np.mean(broken_scores) * 100))
The original average accuracy for is82.6% The 'broken' average accuracy for is 73.8%
# Rescue the corrupted data with min-max normalisation.
estimator = KNeighborsClassifier()
# MinMaxScaler.fit_transform learns per-column min/max and rescales in one
# call; X_transformed has the same shape as X_broken, with every column
# mapped into the range [0, 1].
X_transformed = MinMaxScaler().fit_transform(X_broken)
transformed_scores = cross_val_score(estimator, X_transformed, y, scoring='accuracy')
print("The average accuracy for is {0:.1f}%".format(np.mean(transformed_scores) * 100))
The average accuracy for is 82.9%
# A Pipeline is built from a list of ('name', step) tuples: here the data
# is min-max scaled first, then fed to the KNN classifier. Running it
# through cross_val_score applies the scaling inside each fold, avoiding
# leakage from test folds into the fitted scaler.
scaling_pipeline = Pipeline(
    [
        ('scale', MinMaxScaler())
        , ('predict', KNeighborsClassifier())
    ]
)
scores = cross_val_score(scaling_pipeline, X_broken, y, scoring='accuracy')
# BUG FIX: this previously printed np.mean(transformed_scores) — the result
# of the earlier, non-pipeline experiment — instead of the pipeline's own
# cross-validation scores. Report `scores` here.
print("The pipeline scored an average accuracy of {0:.1f}%".format(np.mean(scores) * 100))
The pipeline scored an average accuracy for is 82.9%