import numpy as np
import pandas as pd
# The cross_validation module is deprecated; train_test_split now lives in model_selection
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import defaultdict
from operator import itemgetter
import warnings
from sklearn.datasets import load_iris
dataset = load_iris()
X = dataset.data
y = dataset.target
print('>>>-----> X is: \n', X[:5])
print('>>>-----> y is: \n', y)
print('>>>-----> len(y) is: \n', len(y))
print('\n dataset.feature_names is: \n', dataset.feature_names)
print('\n dataset.target_names is: \n', dataset.target_names)
>>>-----> X is:
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
>>>-----> y is:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
>>>-----> len(y) is:
 150

 dataset.feature_names is:
 ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

 dataset.target_names is:
 ['setosa' 'versicolor' 'virginica']
print('\n=================== Dataset description ===================\n', dataset.DESCR)
=================== Dataset description ===================
.. _iris_dataset:
Iris plants dataset
--------------------
**Data Set Characteristics:**
:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
- sepal length in cm
- sepal width in cm
- petal length in cm
- petal width in cm
- class:
- Iris-Setosa
- Iris-Versicolour
- Iris-Virginica
:Summary Statistics:
============== ==== ==== ======= ===== ====================
                Min  Max   Mean    SD   Class Correlation
============== ==== ==== ======= ===== ====================
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)
============== ==== ==== ======= ===== ====================
:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988
The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fisher's paper. Note that it's the same as in R, but not as in the UCI
Machine Learning Repository, which has two wrong data points.
This is perhaps the best known database to be found in the
pattern recognition literature. Fisher's paper is a classic in the field and
is referenced frequently to this day. (See Duda & Hart, for example.) The
data set contains 3 classes of 50 instances each, where each class refers to a
type of iris plant. One class is linearly separable from the other 2; the
latter are NOT linearly separable from each other.
.. topic:: References
- Fisher, R.A. "The use of multiple measurements in taxonomic problems"
Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
Mathematical Statistics" (John Wiley, NY, 1950).
- Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.
(Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
- Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
Structure and Classification Rule for Recognition in Partially Exposed
Environments". IEEE Transactions on Pattern Analysis and Machine
Intelligence, Vol. PAMI-2, No. 1, 67-71.
- Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions
on Information Theory, May 1972, 431-433.
- See also: 1988 MLC Proceedings, 54-64. Cheeseman et al.'s AUTOCLASS II
conceptual clustering system finds 3 classes in the data.
- Many, many more ...
# Compute the per-feature means (the resulting array has length 4, one entry per feature)
attribute_means = X.mean(axis=0)
print('len(attribute_means) is: ', len(attribute_means))
attribute_means
len(attribute_means) is: 4
array([5.84333333, 3.05733333, 3.758 , 1.19933333])
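# As a quick cross-check (and since pandas is already imported above), the same
# column means can be read off a DataFrame; a minimal sketch (df_check is a
# hypothetical name, not used elsewhere):
df_check = pd.DataFrame(X, columns=dataset.feature_names)
print(df_check.mean())  # should match attribute_means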
# Compare every element of X against its column mean: values >= the mean become 1,
# values below it become 0. This turns the continuous features into binary categorical ones.
X_d = np.array(X >= attribute_means, dtype='int')
X_d[:5]
array([[0, 1, 0, 0],
[0, 0, 0, 0],
[0, 1, 0, 0],
[0, 1, 0, 0],
[0, 1, 0, 0]])
# Compare against the corresponding raw data to verify the discretization
X[:5]
array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2]])
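# The same discretization can be written explicitly with np.where, as a small
# sketch confirming the broadcast comparison above (X_d_check is a hypothetical name):
X_d_check = np.where(X >= attribute_means, 1, 0)
assert (X_d_check == X_d).all()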
# Fix the random state so that every run produces the same train/test split
random_state = 14
X_train, X_test, y_train, y_test = train_test_split(
    X_d,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=random_state,
)
print("There are {} training samples".format(y_train.shape))
print("There are {} testing samples".format(y_test.shape))
There are (112,) training samples There are (38,) testing samples
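# Since the three classes are perfectly balanced (50 each), a stratified split
# would preserve those proportions in both subsets. A minimal sketch using
# train_test_split's stratify parameter (the *_s names are hypothetical and
# not used for the results below):
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_d, y, test_size=0.25, stratify=y, random_state=random_state)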
warnings.filterwarnings("ignore", category=DeprecationWarning)
# train() is called once per feature; the outer loop below runs
#   train(X_train, y_train, variable) for variable in range(X_train.shape[1])
# where variable is the feature index.
# Inside train() there is an inner loop over the feature's values (here 0 or 1).
# Together the two loops count, for each (feature, value) pair, how many samples
# of each class take that value; the counts are stored in a dict keyed by class.
def train(X, y_true, feature):
    """Computes the predictors and error for a given feature using the OneR algorithm

    Parameters
    ----------
    X: array [n_samples, n_features]
        The two dimensional array that holds the dataset. Each row is a sample, each column is a feature.
    y_true: array [n_samples,]
        The one dimensional array that holds the class values. Corresponds to X, such that y_true[i] is the class value for sample X[i].
    feature: int
        An integer corresponding to the index of the variable we wish to test. 0 <= feature < n_features

    Returns
    -------
    predictors: dictionary of tuples: (value, prediction)
        For each item in the array, if the variable has a given value, make the given prediction.
    error: float
        The ratio of training data that this rule incorrectly predicts.
    """
    print('Current feature index:', feature)
    print('-'*120)
    # Get the number of samples and features in the training set
    n_samples, n_features = X.shape
    # assert raises an exception when its condition is false; here the feature
    # index passed in must lie within [0, n_features)
    assert 0 <= feature < n_features
    # Collect the distinct values this feature takes across all samples
    # (set() removes the duplicates)
    values = set(X[:, feature])
    # Store the predictor for each feature value, to be returned
    predictors = dict()
    errors = []
    for current_value in values:
        most_frequent_class, error = train_feature_value(X, y_true, feature, current_value)
        print('Current feature value:', current_value)
        print('Most frequent class:', most_frequent_class)
        print('Error count:', error)
        # The predictor is a dict: key is the feature value, value is the most frequent class
        predictors[current_value] = most_frequent_class
        print('Predictors {feature value: class}:', predictors)
        errors.append(error)
        print('Error list so far:', errors)
        print('='*70)
    # Compute the total error of using this feature to classify on
    print('-'*120)
    total_error = sum(errors)
    print('Predictors:', predictors)
    print('Total error:', total_error, '\n\n')
    return predictors, total_error
def train_feature_value(X, y_true, feature, value):
    # A dict-based counter: class_counts[c] is the number of samples of class c
    # whose feature takes the given value
    class_counts = defaultdict(int)
    for sample, y in zip(X, y_true):
        # If this sample's value for the feature (here always 0 or 1) equals
        # the value being tested, increment the counter for the sample's class
        if sample[feature] == value:
            class_counts[y] += 1
    # Sort class_counts by count, in descending order
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    print('Class counts sorted in descending order:', sorted_class_counts)
    # The first entry's key is the class with the largest count
    most_frequent_class = sorted_class_counts[0][0]
    # The error is the number of samples that have this feature value but do
    # *not* belong to the most frequent class: sum the counts of all other classes
    error = sum(
        class_count
        for class_value, class_count in class_counts.items()
        if class_value != most_frequent_class
    )
    # Return the most frequent class and the error count, i.e. the class this
    # rule predicts for the given feature value and the number of training
    # samples it gets wrong. For example, for feature 0 with value 0 below,
    # class 0 has the largest count (33), so the error is 15 + 4 = 19.
    return most_frequent_class, error
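# As an aside, the count-and-take-the-majority step above can be written more
# compactly with collections.Counter; a minimal sketch of an equivalent
# function (the name train_feature_value_v2 is hypothetical):
from collections import Counter

def train_feature_value_v2(X, y_true, feature, value):
    # Count the class of every sample whose feature equals the given value
    counts = Counter(y for sample, y in zip(X, y_true) if sample[feature] == value)
    # most_common(1) returns [(class, count)] for the majority class
    most_frequent_class, majority_count = counts.most_common(1)[0]
    # Everything that matched the value but is not the majority class is an error
    error = sum(counts.values()) - majority_count
    return most_frequent_class, error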
# all_predictors is a dict:
#   key: the feature index
#   value: a tuple of (predictor, total error), where predictor is itself a
#          dict mapping each feature value to its most frequent class
all_predictors = {
    variable: train(X_train, y_train, variable)  # variable is the feature index
    for variable in range(X_train.shape[1])
}
print('all_predictors is: \n', all_predictors, '\n')
# errors is a dict: key is the feature index, value is the total error taken
# from all_predictors. The unpacking below mirrors all_predictors' structure:
# {0: ({0: 0, 1: 2}, 41), 1: ({0: 1, 1: 0}, 58), 2: ({0: 0, 1: 2}, 37), 3: ({0: 0, 1: 2}, 37)}
errors = {variable: error for variable, (mapping, error) in all_predictors.items()}
print('errors is: \n', errors, '\n')
# Now choose the best model: sort the features by total error, ascending.
# errors is {0: 41, 1: 58, 2: 37, 3: 37}; sorting its (key, value) pairs by
# the value at index 1 yields a list of (feature index, total error) tuples
print('Errors sorted ascending:', sorted(errors.items(), key=itemgetter(1)))
# Take the first tuple in the list and unpack its two elements into the two
# variables on the left
print('Best entry (feature index, total error):', sorted(errors.items(), key=itemgetter(1))[0], '\n')
best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]
print("The best model is based on variable {0} and has error {1:.2f}".format(best_variable, best_error))
# Build the final model: the chosen feature index plus its predictor dict
model = {'variable': best_variable,
'predictor': all_predictors[best_variable][0]}
print(model)
Current feature index: 0
------------------------------------------------------------------------------------------------------------------------
Class counts sorted in descending order: [(0, 33), (1, 15), (2, 4)]
Current feature value: 0
Most frequent class: 0
Error count: 19
Predictors {feature value: class}: {0: 0}
Error list so far: [19]
======================================================================
Class counts sorted in descending order: [(2, 38), (1, 22)]
Current feature value: 1
Most frequent class: 2
Error count: 22
Predictors {feature value: class}: {0: 0, 1: 2}
Error list so far: [19, 22]
======================================================================
------------------------------------------------------------------------------------------------------------------------
Predictors: {0: 0, 1: 2}
Total error: 41
Current feature index: 1
------------------------------------------------------------------------------------------------------------------------
Class counts sorted in descending order: [(1, 29), (2, 27), (0, 8)]
Current feature value: 0
Most frequent class: 1
Error count: 35
Predictors {feature value: class}: {0: 1}
Error list so far: [35]
======================================================================
Class counts sorted in descending order: [(0, 25), (2, 15), (1, 8)]
Current feature value: 1
Most frequent class: 0
Error count: 23
Predictors {feature value: class}: {0: 1, 1: 0}
Error list so far: [35, 23]
======================================================================
------------------------------------------------------------------------------------------------------------------------
Predictors: {0: 1, 1: 0}
Total error: 58
Current feature index: 2
------------------------------------------------------------------------------------------------------------------------
Class counts sorted in descending order: [(0, 33), (1, 6)]
Current feature value: 0
Most frequent class: 0
Error count: 6
Predictors {feature value: class}: {0: 0}
Error list so far: [6]
======================================================================
Class counts sorted in descending order: [(2, 42), (1, 31)]
Current feature value: 1
Most frequent class: 2
Error count: 31
Predictors {feature value: class}: {0: 0, 1: 2}
Error list so far: [6, 31]
======================================================================
------------------------------------------------------------------------------------------------------------------------
Predictors: {0: 0, 1: 2}
Total error: 37
Current feature index: 3
------------------------------------------------------------------------------------------------------------------------
Class counts sorted in descending order: [(0, 33), (1, 7)]
Current feature value: 0
Most frequent class: 0
Error count: 7
Predictors {feature value: class}: {0: 0}
Error list so far: [7]
======================================================================
Class counts sorted in descending order: [(2, 42), (1, 30)]
Current feature value: 1
Most frequent class: 2
Error count: 30
Predictors {feature value: class}: {0: 0, 1: 2}
Error list so far: [7, 30]
======================================================================
------------------------------------------------------------------------------------------------------------------------
Predictors: {0: 0, 1: 2}
Total error: 37
all_predictors is:
{0: ({0: 0, 1: 2}, 41), 1: ({0: 1, 1: 0}, 58), 2: ({0: 0, 1: 2}, 37), 3: ({0: 0, 1: 2}, 37)}
errors is:
{0: 41, 1: 58, 2: 37, 3: 37}
Errors sorted ascending: [(2, 37), (3, 37), (0, 41), (1, 58)]
Best entry (feature index, total error): (2, 37)
The best model is based on variable 2 and has error 37.00
{'variable': 2, 'predictor': {0: 0, 1: 2}}
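# The winning feature is index 2 (petal length), whose one-rule predictor
# {0: 0, 1: 2} misclassifies 37 of the 112 training samples. As an aside,
# min() with a key selects the same entry without a full sort; a minimal sketch:
best_variable, best_error = min(errors.items(), key=itemgetter(1))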
def predict(X_test, model):
    variable = model['variable']
    print('Model feature index:', variable)
    predictor = model['predictor']
    print('Model predictor:', predictor)
    # Look up each test sample's value for the chosen feature in the predictor
    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])
    return y_predicted
y_predicted = predict(X_test, model)
print('Predicted classes:', y_predicted)
Model feature index: 2
Model predictor: {0: 0, 1: 2}
Predicted classes: [0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2
 2]
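# A vectorized alternative, as a sketch: the discretized feature only takes the
# values 0 and 1, so the predictor dict can become a lookup array that is applied
# to the whole column at once (lookup and y_predicted_vec are hypothetical names):
lookup = np.array([model['predictor'][0], model['predictor'][1]])
y_predicted_vec = lookup[X_test[:, model['variable']]]
assert (y_predicted_vec == y_predicted).all()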
# Compare the two arrays element-wise; the mean of the boolean result is the fraction of correct predictions
accuracy = np.mean(y_predicted == y_test) * 100
print("The test accuracy is {:.2f}%".format(accuracy))
The test accuracy is 65.79%
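# The same number via sklearn, as a cross-check (accuracy_score returns a
# fraction in [0, 1], so multiply by 100 to compare):
from sklearn.metrics import accuracy_score
print("accuracy_score: {:.2f}%".format(accuracy_score(y_test, y_predicted) * 100))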
warnings.filterwarnings("ignore")
print(classification_report(y_test, y_predicted, target_names=["Setosa", "Versicolour", "Virginica"]))
# warnings.filterwarnings("always")
              precision    recall  f1-score   support

      Setosa       0.94      1.00      0.97        17
 Versicolour       0.00      0.00      0.00        13
   Virginica       0.40      1.00      0.57         8

    accuracy                           0.66        38
   macro avg       0.45      0.67      0.51        38
weighted avg       0.51      0.66      0.55        38

# Note: Versicolour scores 0.00 across the board because the learned rule
# {0: 0, 1: 2} only ever predicts class 0 or class 2, so class 1 is never predicted.