import numpy as np
from collections import defaultdict
from pprint import pprint
from operator import itemgetter
# Read the purchase matrix from a whitespace-separated text file.
dataset_filename = 'rawdata/affinity_dataset.txt'
X = np.loadtxt(dataset_filename)

# Rows are purchase (order) records; columns are features, one per kind of
# product that can be bought.
n_samples, n_features = X.shape
print('>>>-----> n_samples is: ', n_samples)
print('>>>-----> n_features is: ', n_features)

# Show the first five records as a quick sanity check.
print('X is: \n', X[:5])
>>>-----> n_samples is: 100
>>>-----> n_features is: 5
X is:
 [[0. 0. 1. 1. 1.]
 [1. 1. 0. 1. 0.]
 [1. 0. 1. 1. 0.]
 [0. 0. 1. 1. 1.]
 [0. 1. 0. 0. 1.]]
# Human-readable names for the five feature columns (one per product).
features = ["bread", "milk", "cheese", "apples", "bananas"]

# Worked example: compute the support and confidence of the rule
# "a customer who bought cheese also buys apples".
# Preview the cheese (column 2) and apples (column 3) columns of the first
# five records.
X[:5, 2:4]
array([[1., 1.],
[0., 1.],
[1., 1.],
[1., 1.],
[0., 0.]])
# Count the records in which cheese (column index 2) was bought.
# A single vectorised comparison replaces the per-row Python loop; int()
# keeps the result a plain Python integer, as the loop counter was.
num_cheese_purchase = int(np.count_nonzero(X[:, 2] == 1))
print("{0} people bought cheese".format(num_cheese_purchase))
41 people bought cheese
# Evaluate the rule "bought cheese -> also bought apples" on every record.
# Only records where cheese (column 2) was bought are relevant to the rule.
bought_cheese = X[:, 2] == 1
bought_apples = X[:, 3] == 1

# The rule holds when cheese and apples were both bought ...
rule_valid = int(np.count_nonzero(bought_cheese & bought_apples))
# ... and fails when cheese was bought but apples were not.
rule_invalid = int(np.count_nonzero(bought_cheese & ~bought_apples))

print("{0} cases of the rule being valid were discovered".format(rule_valid))
print("{0} cases of the rule being invalid were discovered".format(rule_invalid))
25 cases of the rule being valid were discovered
16 cases of the rule being invalid were discovered
# Support: the absolute number of records in which the rule holds.
support = rule_valid
# Confidence: the fraction of premise (cheese) purchases in which the rule
# holds.
confidence = rule_valid / num_cheese_purchase

print(
    "The support is {0} and the confidence is {1}/{2} = {3:.4f}.".format(
        support, rule_valid, num_cheese_purchase, confidence
    )
)
confidence_percent = 100 * confidence
print("As a percentage, that is {0:.2f}%.".format(confidence_percent))
The support is 25 and the confidence is 25/41 = 0.6098.
As a percentage, that is 60.98%.
# Records in which a rule holds, keyed by (premise, conclusion) feature
# indices.
valid_rules = defaultdict(int)
# Records in which the premise was bought but the conclusion was not.
invalid_rules = defaultdict(int)
# Number of records in which each premise feature was bought.
num_occurences = defaultdict(int)

# Walk over every record in the dataset.
for sample in X:
    # Treat each feature in turn as the premise of a rule.
    for premise in range(n_features):
        # A premise that was not bought in this record says nothing about
        # the rule, so move on to the next feature.
        if sample[premise] == 0:
            continue
        # The premise was bought: count one more occurrence of it.
        num_occurences[premise] += 1
        # Pair the premise with every other feature as the conclusion.
        for conclusion in range(n_features):
            # A rule from a feature to itself is meaningless.
            if premise == conclusion:
                continue
            if sample[conclusion] == 1:
                # Conclusion was bought too: the rule holds for this record.
                valid_rules[(premise, conclusion)] += 1
            else:
                # Conclusion was not bought: the rule fails for this record.
                invalid_rules[(premise, conclusion)] += 1
# The support of every rule is simply its number of valid records.
support = valid_rules
# Confidence per rule, keyed by (premise, conclusion) like valid_rules.
confidence = defaultdict(float)
# Confidence = valid records of the rule (its support) divided by the number
# of records in which the premise was bought.
for (premise, conclusion), valid_count in valid_rules.items():
    confidence[(premise, conclusion)] = valid_count / num_occurences[premise]
# Print every rule together with its confidence and support, followed by a
# separator line.
for (premise, conclusion), rule_confidence in confidence.items():
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("规则: 如果购买了‘{0}’,那么也会购买‘{1}’。".format(premise_name, conclusion_name))
    print(" - 置信度:{0:.2f}%".format(rule_confidence * 100))
    print(" - 支持度:{0}".format(support[(premise, conclusion)]))
    print("-"*50)
规则: 如果购买了‘cheese’,那么也会购买‘apples’。
- 置信度:60.98%
- 支持度:25
--------------------------------------------------
规则: 如果购买了‘cheese’,那么也会购买‘bananas’。
- 置信度:65.85%
- 支持度:27
--------------------------------------------------
规则: 如果购买了‘apples’,那么也会购买‘cheese’。
- 置信度:69.44%
- 支持度:25
--------------------------------------------------
规则: 如果购买了‘apples’,那么也会购买‘bananas’。
- 置信度:58.33%
- 支持度:21
--------------------------------------------------
规则: 如果购买了‘bananas’,那么也会购买‘cheese’。
- 置信度:45.76%
- 支持度:27
--------------------------------------------------
规则: 如果购买了‘bananas’,那么也会购买‘apples’。
- 置信度:35.59%
- 支持度:21
--------------------------------------------------
规则: 如果购买了‘bread’,那么也会购买‘milk’。
- 置信度:51.85%
- 支持度:14
--------------------------------------------------
规则: 如果购买了‘bread’,那么也会购买‘apples’。
- 置信度:18.52%
- 支持度:5
--------------------------------------------------
规则: 如果购买了‘milk’,那么也会购买‘bread’。
- 置信度:30.43%
- 支持度:14
--------------------------------------------------
规则: 如果购买了‘milk’,那么也会购买‘apples’。
- 置信度:19.57%
- 支持度:9
--------------------------------------------------
规则: 如果购买了‘apples’,那么也会购买‘bread’。
- 置信度:13.89%
- 支持度:5
--------------------------------------------------
规则: 如果购买了‘apples’,那么也会购买‘milk’。
- 置信度:25.00%
- 支持度:9
--------------------------------------------------
规则: 如果购买了‘bread’,那么也会购买‘cheese’。
- 置信度:14.81%
- 支持度:4
--------------------------------------------------
规则: 如果购买了‘cheese’,那么也会购买‘bread’。
- 置信度:9.76%
- 支持度:4
--------------------------------------------------
规则: 如果购买了‘milk’,那么也会购买‘bananas’。
- 置信度:41.30%
- 支持度:19
--------------------------------------------------
规则: 如果购买了‘bananas’,那么也会购买‘milk’。
- 置信度:32.20%
- 支持度:19
--------------------------------------------------
规则: 如果购买了‘bread’,那么也会购买‘bananas’。
- 置信度:62.96%
- 支持度:17
--------------------------------------------------
规则: 如果购买了‘bananas’,那么也会购买‘bread’。
- 置信度:28.81%
- 支持度:17
--------------------------------------------------
规则: 如果购买了‘milk’,那么也会购买‘cheese’。
- 置信度:15.22%
- 支持度:7
--------------------------------------------------
规则: 如果购买了‘cheese’,那么也会购买‘milk’。
- 置信度:17.07%
- 支持度:7
--------------------------------------------------
def print_rule(premise, conclusion, support, confidence, features):
    """Print one association rule with its support and confidence.

    Args:
        premise: Index into ``features`` of the rule's premise.
        conclusion: Index into ``features`` of the rule's conclusion.
        support: Mapping looked up with the ``(premise, conclusion)`` tuple;
            the value is printed as-is.
        confidence: Mapping looked up with the ``(premise, conclusion)``
            tuple; the value is multiplied by 100 and printed with two
            decimals as a percentage.
        features: Sequence of feature names indexed by ``premise`` and
            ``conclusion``.
    """
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("规则: 假如某人购买了‘{0}’那么他也会购买‘{1}’。".format(premise_name, conclusion_name))
    print(" - 支持度:{0}".format(support[(premise, conclusion)]))
    print(" - 置信度:{0:.2f}%".format(confidence[(premise, conclusion)]*100))
    print("-"*70)
# Rank the rules by support, highest first, and print the top five.
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)
# Slicing instead of indexing range(5) avoids an IndexError when fewer than
# five rules were found.
for rank, ((premise, conclusion), _) in enumerate(sorted_support[:5], start=1):
    print("Rule #{0}".format(rank))
    print_rule(premise, conclusion, support, confidence, features)
Rule #1
规则: 假如某人购买了‘cheese’那么他也会购买‘bananas’。
- 支持度:27
- 置信度:65.85%
----------------------------------------------------------------------
Rule #2
规则: 假如某人购买了‘bananas’那么他也会购买‘cheese’。
- 支持度:27
- 置信度:45.76%
----------------------------------------------------------------------
Rule #3
规则: 假如某人购买了‘cheese’那么他也会购买‘apples’。
- 支持度:25
- 置信度:60.98%
----------------------------------------------------------------------
Rule #4
规则: 假如某人购买了‘apples’那么他也会购买‘cheese’。
- 支持度:25
- 置信度:69.44%
----------------------------------------------------------------------
Rule #5
规则: 假如某人购买了‘apples’那么他也会购买‘bananas’。
- 支持度:21
- 置信度:58.33%
----------------------------------------------------------------------
# Rank the rules by confidence, highest first, and print the top five.
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)
# Slicing instead of indexing range(5) avoids an IndexError when fewer than
# five rules were found.
for rank, ((premise, conclusion), _) in enumerate(sorted_confidence[:5], start=1):
    print("Rule #{0}".format(rank))
    print_rule(premise, conclusion, support, confidence, features)
Rule #1
规则: 假如某人购买了‘apples’那么他也会购买‘cheese’。
- 支持度:25
- 置信度:69.44%
----------------------------------------------------------------------
Rule #2
规则: 假如某人购买了‘cheese’那么他也会购买‘bananas’。
- 支持度:27
- 置信度:65.85%
----------------------------------------------------------------------
Rule #3
规则: 假如某人购买了‘bread’那么他也会购买‘bananas’。
- 支持度:17
- 置信度:62.96%
----------------------------------------------------------------------
Rule #4
规则: 假如某人购买了‘cheese’那么他也会购买‘apples’。
- 支持度:25
- 置信度:60.98%
----------------------------------------------------------------------
Rule #5
规则: 假如某人购买了‘apples’那么他也会购买‘bananas’。
- 支持度:21
- 置信度:58.33%
----------------------------------------------------------------------