机器学习之贝叶斯样本分类

技术2022-07-11 107

1. 基本介绍

贝叶斯分类是基于贝叶斯定理和属性特征条件独立性假设的分类方法。大学《概率论与数理统计》课程中，条件概率的表达式为：$P(A \mid B) = \dfrac{P(AB)}{P(B)}$，由此可得贝叶斯公式：$P(A \mid B) = \dfrac{P(B \mid A)\,P(A)}{P(B)}$。

2. scikit-learn示例

10个训练样本，二分类0/1

2.1 模块导入

import csv import numpy as np from sklearn.naive_bayes import GaussianNB import matplotlib.pyplot as plt

2.2 文本数据加载

def load_data(path='F:/study/AI/src/ml/sklearn/bayes.csv'):
    """Load training samples and their class labels from a CSV file.

    The first row of the file is a header whose first two fields are the
    number of samples and the number of features. Every following non-empty
    row holds the feature values of one sample, with the integer class label
    in the last column.

    Args:
        path: Location of the CSV file. Defaults to the path used by the
            original example, so existing ``load_data()`` calls keep working.

    Returns:
        A tuple ``(n_sample, n_feature, train_sets, target_sets)`` where
        ``train_sets`` is a float64 array of shape ``(n_sample, n_feature)``
        and ``target_sets`` is an integer array of shape ``(n_sample,)``.

    Raises:
        ValueError: If the file holds fewer data rows than the header
            announces (the arrays would otherwise contain uninitialised
            memory).
    """
    # newline='' is what the csv module expects for files it reads.
    with open(path, newline='') as csv_file:
        rows = csv.reader(csv_file)
        first_row = next(rows)
        n_sample = int(first_row[0])
        n_feature = int(first_row[1])

        # Size the matrix from the header instead of hard-coding 2 columns.
        train_sets = np.empty((n_sample, n_feature), dtype=np.float64)
        # ``np.int`` was removed from NumPy; the builtin ``int`` is the
        # dtype it used to alias.
        target_sets = np.empty((n_sample,), dtype=int)

        loaded = 0
        for row in rows:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one).
                continue
            train_sets[loaded] = np.asarray(row[:-1], dtype=np.float64)
            target_sets[loaded] = int(row[-1])
            loaded += 1

    if loaded != n_sample:
        raise ValueError(
            "expected %d samples but found %d in %s" % (n_sample, loaded, path)
        )
    return n_sample, n_feature, train_sets, target_sets

2.3 Test

if __name__ == '__main__':
    # Demo: fit a Gaussian naive Bayes model on the CSV samples, classify the
    # point (3.0, 4.0) and plot the training data together with that point.
    n_sample, n_feature, train_sets, target_sets = load_data()
    print("样本数：",n_sample)
    print("特征数：",n_feature)
    print("训练数据集：\n", train_sets)
    print("样本数据对应的分类：\n", target_sets)

    bayes = GaussianNB().fit(train_sets, target_sets).predict([[3.0, 4.0]])
    print("预测样本[3.0,4.0]的分类结果：", bayes)

    # Split the samples by their label rather than by row position, so the
    # colours stay correct even if the file is not sorted with exactly five
    # class-0 rows first.
    is_class0 = target_sets == 0
    plt.scatter(train_sets[is_class0, 0], train_sets[is_class0, 1], color='black')
    plt.scatter(train_sets[~is_class0, 0], train_sets[~is_class0, 1], color='blue')
    # The query point being classified.
    plt.scatter([3.0], [4.0], color='red')
    plt.show()

2.4 结果打印

样本数： 10 特征数： 2 训练数据集： [[1.1 1.7] [0.5 1.2] [1.7 2.3] [3.4 4.5] [4.5 5.3] [2.1 1.3] [1.3 0.5] [3.2 2.3] [4.1 2.5] [2.5 1.8]] 样本数据对应的分类： [0 0 0 0 0 1 1 1 1 1] 预测样本[3.0,4.0]的分类结果： [0]

2.5 结果图示

3. 基本算法示例

此例子，参考博客：https://blog.csdn.net/qq_25948717/article/details/81744277。谢谢博主的分享。示例稍作改变，将组合进行了去重。

3.1 样本数据

3.2 条件概率公式

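针对以下条件的概率公式（以特征组合 long、sweet、yellow 为例，并假设各特征相互独立）：$P(\text{fruit} \mid \text{long}, \text{sweet}, \text{yellow}) = \dfrac{P(\text{long} \mid \text{fruit})\,P(\text{sweet} \mid \text{fruit})\,P(\text{yellow} \mid \text{fruit})\,P(\text{fruit})}{P(\text{long})\,P(\text{sweet})\,P(\text{yellow})}$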

3.3 代码

3.3.1 特征组合

这里稍作改动：原博客生成的特征组合存在重复，此处已去重。

#!/usr/bin/python
# encoding: utf-8
"""Enumerate every combination that picks one value from each feature group."""
from functools import reduce
from itertools import product


def gen_features(data_list, code=''):
    """Join one value from every group into a single string per combination.

    Args:
        data_list: A list of groups, each group being a list of strings.
        code: Separator inserted between the joined values.

    Returns:
        A list of joined strings, ordered with the last group varying
        fastest. A single group is returned unchanged; an empty
        ``data_list`` yields ``[]``.
    """
    if not data_list:
        # reduce() without an initial value raises TypeError on empty input.
        return []

    def zuhe(list1, list2):
        # "zuhe" = combination: pairwise join of two groups.
        return [(i + code + j) for i in list1 for j in list2]

    return reduce(zuhe, data_list)


def get_feats(data_list):
    """Return every combination as a list holding one value per group.

    The combinations are built directly with ``itertools.product`` instead
    of joining on ``','`` and splitting again, so values that themselves
    contain a comma are kept intact. The ordering (last group varies
    fastest) is unchanged.

    Args:
        data_list: A list of groups, each group being a list of values.

    Returns:
        A list of lists; ``[]`` when ``data_list`` is empty.
    """
    if not data_list:
        return []
    return [list(combo) for combo in product(*data_list)]


if __name__ == '__main__':
    data = [['long', 'not_long'], ['sweet', 'not_sweet'], ['yellow', 'not_yellow']]
    print(get_feats(data))

3.3.2 条件概率计算

# !usr/bin/python
# encoding : utf-8
"""Naive Bayes fruit classifier computed from per-fruit feature counts."""

# Observed counts: for each fruit, how many samples showed each feature value.
datasets = {
    'banala': {'long': 400, 'not_long': 100, 'sweet': 350, 'not_sweet': 150,
               'yellow': 450, 'not_yellow': 50},
    'orange': {'long': 0, 'not_long': 300, 'sweet': 150, 'not_sweet': 150,
               'yellow': 300, 'not_yellow': 0},
    'other_fruit': {'long': 100, 'not_long': 100, 'sweet': 150, 'not_sweet': 50,
                    'yellow': 50, 'not_yellow': 150},
}


def calcu_fruit_total(data):
    """Count the samples of every fruit and the overall total.

    A fruit's sample count is taken as its ``'long'`` count plus its
    ``'not_long'`` count, since every sample is one or the other.

    Args:
        data: Mapping ``fruit -> {feature: count}``.

    Returns:
        ``(total_num, fruit_count)``; for the bundled ``datasets`` this is
        ``(1000, {'banala': 500, 'orange': 300, 'other_fruit': 200})``.
    """
    fruit_count = {
        fruit: counts['long'] + counts['not_long']
        for fruit, counts in data.items()
    }
    return sum(fruit_count.values()), fruit_count


def feature_prob(data):
    """Compute the marginal probability of every feature value.

    The feature names are taken from the first fruit in ``data`` (all fruits
    are expected to share the same features), so any data set with that
    layout works, not only one containing a ``'banala'`` entry.

    Args:
        data: Mapping ``fruit -> {feature: count}``.

    Returns:
        Mapping ``feature -> probability``; for the bundled ``datasets``:
        ``{'long': 0.5, 'not_long': 0.5, 'sweet': 0.65, 'not_sweet': 0.35,
        'yellow': 0.8, 'not_yellow': 0.2}``. Empty ``data`` yields ``{}``.
    """
    total_num, _ = calcu_fruit_total(data)
    feature_names = next(iter(data.values()), {})
    return {
        feature: sum(int(counts[feature]) for counts in data.values()) / total_num
        for feature in feature_names
    }


def feature_cond_prob(data):
    """Compute the probability of every feature value given the fruit.

    Args:
        data: Mapping ``fruit -> {feature: count}``.

    Returns:
        Mapping ``fruit -> {feature: P(feature | fruit)}``; for the bundled
        ``datasets``:
        ``{'banala': {'long': 0.8, 'not_long': 0.2, 'sweet': 0.7,
        'not_sweet': 0.3, 'yellow': 0.9, 'not_yellow': 0.1},
        'orange': {'long': 0.0, 'not_long': 1.0, 'sweet': 0.5,
        'not_sweet': 0.5, 'yellow': 1.0, 'not_yellow': 0.0},
        'other_fruit': {'long': 0.5, 'not_long': 0.5, 'sweet': 0.75,
        'not_sweet': 0.25, 'yellow': 0.25, 'not_yellow': 0.75}}``.
    """
    _, fruit_count = calcu_fruit_total(data)
    return {
        fruit: {
            feature: count / fruit_count[fruit]
            for feature, count in counts.items()
        }
        for fruit, counts in data.items()
    }


def fruit_prob(data):
    """Compute the prior probability of every fruit.

    Args:
        data: Mapping ``fruit -> {feature: count}``.

    Returns:
        Mapping ``fruit -> probability``; for the bundled ``datasets``:
        ``{'banala': 0.5, 'orange': 0.3, 'other_fruit': 0.2}``.
    """
    total_num, fruit_count = calcu_fruit_total(data)
    return {fruit: count / total_num for fruit, count in fruit_count.items()}


class Bayes_Classify():
    """Naive Bayes classifier over a table of per-fruit feature counts."""

    def __init__(self, data=datasets):
        """Precompute the probability tables for ``data``.

        Args:
            data: Mapping ``fruit -> {feature: count}``. Defaults to the
                module-level ``datasets``.
        """
        # Use the argument; it was previously ignored in favour of the
        # module-level ``datasets``.
        self.data = data
        self.feature_prob = feature_prob(self.data)
        self.feature_cond_prob = feature_cond_prob(self.data)
        self.fruit_prob = fruit_prob(self.data)
        self.fruit_list = list(self.data)

    def get_fruit_cond_prob(self, features_combination):
        """Score every fruit for a given combination of feature values.

        For each fruit the score is
        ``P(fruit) * prod(P(feature | fruit) / P(feature))``. The evidence
        term is approximated by the product of the feature marginals, so the
        scores are not normalised and may exceed 1.

        Args:
            features_combination: Feature values, for example
                ``['not_long', 'not_sweet', 'not_yellow']``.

        Returns:
            Mapping ``fruit -> score``; for the example above with the
            bundled ``datasets``:
            ``{'orange': 0.0, 'banala': 0.08571428571428573,
            'other_fruit': 0.5357142857142858}``.
        """
        res = {}
        for fruit in self.fruit_list:
            prob = self.fruit_prob[fruit]
            for feature in features_combination:
                prob *= (self.feature_cond_prob[fruit][feature]
                         / self.feature_prob[feature])
            res[fruit] = prob
        return res


if __name__ == '__main__':
    # Example usage:
    #   classifer = Bayes_Classify()
    #   classifer.get_fruit_cond_prob(['long', 'sweet', 'yellow'])
    print("bayes_comp test end")

3.3.3 测试

# !/user/bin/python
# encoding : utf-8
"""Classify every feature combination and print the most likely fruit."""
from bayes_comp import Bayes_Classify
from generate_features import get_feats

features_combination = [
    ['long', 'not_long'],
    ['sweet', 'not_sweet'],
    ['yellow', 'not_yellow'],
]


def main():
    """Print the scores and the winning fruit for each feature combination."""
    model = Bayes_Classify()
    for combo in get_feats(features_combination):
        print("特征值：", combo, sep='\t')
        scores = model.get_fruit_cond_prob(combo)
        # Score of every fruit for this combination.
        print("预测结果：", scores, sep='\t')
        # The label with the highest score; the first one wins on a tie.
        best_fruit, _ = max(scores.items(), key=lambda item: item[1])
        print('水果类别：', best_fruit, sep='\t')
        print('\n')


if __name__ == '__main__':
    main()

3.3.4 结果

特征值： ['long', 'sweet', 'yellow'] 预测结果： {'orange': 0.0, 'other_fruit': 0.07211538461538461, 'banala': 0.9692307692307693} 水果类别： banala 特征值： ['long', 'sweet', 'not_yellow'] 预测结果： {'orange': 0.0, 'other_fruit': 0.8653846153846153, 'banala': 0.4307692307692308} 水果类别： other_fruit 特征值： ['long', 'not_sweet', 'yellow'] 预测结果： {'orange': 0.0, 'other_fruit': 0.04464285714285715, 'banala': 0.7714285714285716} 水果类别： banala 特征值： ['long', 'not_sweet', 'not_yellow'] 预测结果： {'orange': 0.0, 'other_fruit': 0.5357142857142858, 'banala': 0.3428571428571429} 水果类别： other_fruit 特征值： ['not_long', 'sweet', 'yellow'] 预测结果： {'orange': 0.5769230769230769, 'other_fruit': 0.07211538461538461, 'banala': 0.24230769230769234} 水果类别： orange 特征值： ['not_long', 'sweet', 'not_yellow'] 预测结果： {'orange': 0.0, 'other_fruit': 0.8653846153846153, 'banala': 0.1076923076923077} 水果类别： other_fruit 特征值： ['not_long', 'not_sweet', 'yellow'] 预测结果： {'orange': 1.0714285714285714, 'other_fruit': 0.04464285714285715, 'banala': 0.1928571428571429} 水果类别： orange 特征值： ['not_long', 'not_sweet', 'not_yellow'] 预测结果： {'orange': 0.0, 'other_fruit': 0.5357142857142858, 'banala': 0.08571428571428573} 水果类别： other_fruit

转载请注明原文地址:https://ipadbbs.8miu.com/read-14582.html

最新回复(0)