import numpy as np
from collections import defaultdict
from pprint import pprint
from operator import itemgetter
# Load the affinity dataset from a text file into a NumPy array.
dataset_filename = 'rawdata/affinity_dataset.txt'
X = np.loadtxt(dataset_filename)
# The unpacking below requires X to be two-dimensional.
n_samples, n_features = X.shape
# Each row is one purchase (order) record.
print('>>>-----> n_samples is: ', n_samples)
# Each column is a feature standing for one kind of purchased product.
print('>>>-----> n_features is: ', n_features)
# Preview the first five records.
print('X is: \n', X[:5])

>>>-----> n_samples is:  100
>>>-----> n_features is:  5
X is: 
 [[0. 0. 1. 1. 1.]
 [1. 1. 0. 1. 0.]
 [1. 0. 1. 1. 0.]
 [0. 0. 1. 1. 1.]
 [0. 1. 0. 0. 1.]]


# The 5 features name the 5 products that can be purchased; the position in
# this list is used as the column index into each record of X.
features = ["bread", "milk", "cheese", "apples", "bananas"]


# Worked example: compute the support and confidence of the rule
# "a customer who bought cheese also buys apples".
# Preview columns 2 and 3 (cheese and apples) of the first five records.
X[:,2:4][:5]

array([[1., 1.],
       [0., 1.],
       [1., 1.],
       [1., 1.],
       [0., 0.]])


# Tally the records whose cheese column (index 2) is set to 1.
num_cheese_purchase = sum(1 for record in X if record[2] == 1)
print("{0} people bought cheese".format(num_cheese_purchase))

41 people bought cheese


# Evaluate the rule "bought cheese -> also bought apples".
# Only records where cheese (column 2) was purchased are relevant to the rule.
cheese_records = [record for record in X if record[2] == 1]
# The rule holds for a record when apples (column 3) were purchased as well...
rule_valid = sum(1 for record in cheese_records if record[3] == 1)
# ...and fails for every other cheese record.
rule_invalid = len(cheese_records) - rule_valid
print("{0} cases of the rule being valid were discovered".format(rule_valid))
print("{0} cases of the rule being invalid were discovered".format(rule_invalid))

25 cases of the rule being valid were discovered
16 cases of the rule being invalid were discovered


# Support is reported as the absolute number of records where the rule held;
# confidence is that number divided by the number of cheese purchases.
support = rule_valid
confidence = rule_valid / num_cheese_purchase
summary_line = "The support is {0} and the confidence is {1}/{2} = {3:.4f}.".format(
    support, rule_valid, num_cheese_purchase, confidence
)
percentage_line = "As a percentage, that is {0:.2f}%.".format(100 * confidence)
print(summary_line)
print(percentage_line)

The support is 25 and the confidence is 25/41 = 0.6098.
As a percentage, that is 60.98%.


# (premise, conclusion) -> number of records in which the rule held.
valid_rules = defaultdict(int)
# (premise, conclusion) -> number of records in which the rule failed.
invalid_rules = defaultdict(int)
# premise -> number of records in which the premise product was purchased.
num_occurences = defaultdict(int)

# Walk every record and every ordered pair of distinct features.
for order in X:
    for premise in range(n_features):
        # Only records where the premise product was purchased count toward its rules.
        if order[premise] != 0:
            num_occurences[premise] += 1
            for conclusion in range(n_features):
                # A feature paired with itself is not a meaningful rule.
                if conclusion != premise:
                    # Pick the counter matching whether the conclusion product was purchased too.
                    tally = valid_rules if order[conclusion] == 1 else invalid_rules
                    tally[(premise, conclusion)] += 1

# The support of every rule is simply its count of valid records.
support = valid_rules
# Confidence of a rule = valid records / records in which the premise occurred.
confidence = defaultdict(float)
for rule, hits in valid_rules.items():
    confidence[rule] = hits / num_occurences[rule[0]]


# Report every discovered rule together with its confidence and support.
for rule_key, rule_confidence in confidence.items():
    premise, conclusion = rule_key
    print("规则: 如果购买了‘{0}’，那么也会购买‘{1}’。".format(features[premise], features[conclusion]))
    print("    - 置信度：{0:.2f}%".format(rule_confidence*100))
    print("    - 支持度：{0}".format(support[rule_key]))
    print("-"*50)

规则: 如果购买了‘cheese’，那么也会购买‘apples’。
    - 置信度：60.98%
    - 支持度：25
--------------------------------------------------
规则: 如果购买了‘cheese’，那么也会购买‘bananas’。
    - 置信度：65.85%
    - 支持度：27
--------------------------------------------------
规则: 如果购买了‘apples’，那么也会购买‘cheese’。
    - 置信度：69.44%
    - 支持度：25
--------------------------------------------------
规则: 如果购买了‘apples’，那么也会购买‘bananas’。
    - 置信度：58.33%
    - 支持度：21
--------------------------------------------------
规则: 如果购买了‘bananas’，那么也会购买‘cheese’。
    - 置信度：45.76%
    - 支持度：27
--------------------------------------------------
规则: 如果购买了‘bananas’，那么也会购买‘apples’。
    - 置信度：35.59%
    - 支持度：21
--------------------------------------------------
规则: 如果购买了‘bread’，那么也会购买‘milk’。
    - 置信度：51.85%
    - 支持度：14
--------------------------------------------------
规则: 如果购买了‘bread’，那么也会购买‘apples’。
    - 置信度：18.52%
    - 支持度：5
--------------------------------------------------
规则: 如果购买了‘milk’，那么也会购买‘bread’。
    - 置信度：30.43%
    - 支持度：14
--------------------------------------------------
规则: 如果购买了‘milk’，那么也会购买‘apples’。
    - 置信度：19.57%
    - 支持度：9
--------------------------------------------------
规则: 如果购买了‘apples’，那么也会购买‘bread’。
    - 置信度：13.89%
    - 支持度：5
--------------------------------------------------
规则: 如果购买了‘apples’，那么也会购买‘milk’。
    - 置信度：25.00%
    - 支持度：9
--------------------------------------------------
规则: 如果购买了‘bread’，那么也会购买‘cheese’。
    - 置信度：14.81%
    - 支持度：4
--------------------------------------------------
规则: 如果购买了‘cheese’，那么也会购买‘bread’。
    - 置信度：9.76%
    - 支持度：4
--------------------------------------------------
规则: 如果购买了‘milk’，那么也会购买‘bananas’。
    - 置信度：41.30%
    - 支持度：19
--------------------------------------------------
规则: 如果购买了‘bananas’，那么也会购买‘milk’。
    - 置信度：32.20%
    - 支持度：19
--------------------------------------------------
规则: 如果购买了‘bread’，那么也会购买‘bananas’。
    - 置信度：62.96%
    - 支持度：17
--------------------------------------------------
规则: 如果购买了‘bananas’，那么也会购买‘bread’。
    - 置信度：28.81%
    - 支持度：17
--------------------------------------------------
规则: 如果购买了‘milk’，那么也会购买‘cheese’。
    - 置信度：15.22%
    - 支持度：7
--------------------------------------------------
规则: 如果购买了‘cheese’，那么也会购买‘milk’。
    - 置信度：17.07%
    - 支持度：7
--------------------------------------------------


def print_rule(premise, conclusion, support, confidence, features):
    """Print one association rule with its support and confidence.

    Args:
        premise: Index into ``features`` of the product in the "if" part.
        conclusion: Index into ``features`` of the product in the "then" part.
        support: Mapping from ``(premise, conclusion)`` to the rule's support.
        confidence: Mapping from ``(premise, conclusion)`` to the rule's
            confidence as a fraction; it is printed as a percentage.
        features: Sequence of product names indexed by feature number.
    """
    rule_key = (premise, conclusion)
    antecedent, consequent = features[premise], features[conclusion]
    print("规则: 假如某人购买了‘{0}’那么他也会购买‘{1}’。".format(antecedent, consequent))
    print("    - 支持度：{0}".format(support[rule_key]))
    print("    - 置信度：{0:.2f}%".format(confidence[rule_key]*100))
    print("-"*70)


# Rank all rules by support, highest first, and show the top five.
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)
# Slicing (rather than indexing over range(5)) avoids an IndexError when
# fewer than five rules were discovered.
for rank, ((premise, conclusion), _value) in enumerate(sorted_support[:5], start=1):
    print("Rule #{0}".format(rank))
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
规则: 假如某人购买了‘cheese’那么他也会购买‘bananas’。
    - 支持度：27
    - 置信度：65.85%
----------------------------------------------------------------------
Rule #2
规则: 假如某人购买了‘bananas’那么他也会购买‘cheese’。
    - 支持度：27
    - 置信度：45.76%
----------------------------------------------------------------------
Rule #3
规则: 假如某人购买了‘cheese’那么他也会购买‘apples’。
    - 支持度：25
    - 置信度：60.98%
----------------------------------------------------------------------
Rule #4
规则: 假如某人购买了‘apples’那么他也会购买‘cheese’。
    - 支持度：25
    - 置信度：69.44%
----------------------------------------------------------------------
Rule #5
规则: 假如某人购买了‘apples’那么他也会购买‘bananas’。
    - 支持度：21
    - 置信度：58.33%
----------------------------------------------------------------------


# Rank all rules by confidence, highest first, and show the top five.
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)
# Slicing (rather than indexing over range(5)) avoids an IndexError when
# fewer than five rules were discovered.
for rank, ((premise, conclusion), _value) in enumerate(sorted_confidence[:5], start=1):
    print("Rule #{0}".format(rank))
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
规则: 假如某人购买了‘apples’那么他也会购买‘cheese’。
    - 支持度：25
    - 置信度：69.44%
----------------------------------------------------------------------
Rule #2
规则: 假如某人购买了‘cheese’那么他也会购买‘bananas’。
    - 支持度：27
    - 置信度：65.85%
----------------------------------------------------------------------
Rule #3
规则: 假如某人购买了‘bread’那么他也会购买‘bananas’。
    - 支持度：17
    - 置信度：62.96%
----------------------------------------------------------------------
Rule #4
规则: 假如某人购买了‘cheese’那么他也会购买‘apples’。
    - 支持度：25
    - 置信度：60.98%
----------------------------------------------------------------------
Rule #5
规则: 假如某人购买了‘apples’那么他也会购买‘bananas’。
    - 支持度：21
    - 置信度：58.33%
----------------------------------------------------------------------

亲和性分析

概念¶

准备数据¶

示例计算support和confidence¶

rule：购买了cheese也会购买apples¶

循环计算全部演示数据¶

查看计算结果¶

按支持度逆序排列（取前5名）¶

按置信度逆序排列（取前5名）¶