ID3 algorithm: the core idea is to apply the information-gain criterion at each node of the decision tree to select the splitting feature, building the tree recursively. Concretely: starting from the root node, compute the information gain of every candidate feature on the data at that node, take the feature with the largest information gain as the node's feature, and create one child node for each value of that feature; then call the same procedure recursively on the child nodes, stopping once the information gain of every remaining feature is very small or no features are left to choose from.

ID3 also has another problem: it cannot handle continuous features directly. A continuous feature must be converted to a discrete one before ID3 can use it, and this conversion destroys part of the internal structure of the continuous variable. Since the dataset used here consists of continuous values, this experiment uses region-based partitioning to map the continuous data to discrete data.

[Figure: continuous data and the corresponding discretized data]

1 ID3 decision tree classifier

[Figure: the decision tree built on the Wine dataset]
[Figure: classification accuracy obtained with this tree]
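For reference, the two quantities the algorithm evaluates at every node are the empirical entropy of a dataset D and the information gain of a feature A. These are the standard ID3 definitions and correspond to caculateEntropy and caculateGain in the code below:

H(D) = -\sum_{k=1}^{K} \frac{|C_k|}{|D|} \log_2 \frac{|C_k|}{|D|}

H(D|A) = \sum_{i=1}^{n} \frac{|D_i|}{|D|} H(D_i)

g(D, A) = H(D) - H(D|A)

where C_k is the set of samples with label k and D_i is the subset of D on which feature A takes its i-th value. The feature maximizing g(D, A) is chosen as the split.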
Python code:
from math import log
from graphviz import Digraph


def loadSplitDataSet(txtname, rate):
    """Load the comma-separated data file and split it into a training set
    and a test set; `rate` is the fraction of samples kept for training."""
    with open(txtname) as file:
        lines1 = file.readlines()
    lines2 = []
    for line in lines1:
        lineTemp = line.replace('\n', '').split(',')
        lines2.append(lineTemp)
    # take every step-th sample as test data, e.g. rate=0.8 -> step=5 -> 20% test
    step = int(1 / (1 - rate))
    testSet = lines2[::step]
    del lines2[::step]
    trainSet = lines2
    trainData, testData, trainLabel, testLabel = [], [], [], []
    for x in trainSet:
        trainLabel.append(int(x[0]))          # the first column is the class label
        trainData.append([float(y) for y in x[1:]])
    for x in testSet:
        testLabel.append(int(x[0]))
        testData.append([float(y) for y in x[1:]])
    return trainData, testData, trainLabel, testLabel


class DecisonTree:
    def __init__(self, trainData, trainLabel, threshold):
        self.trainData = []
        self.trainLabel = []
        self.featureValus = {}   # every possible (discretized) value of each feature
        self.interval = []       # per-feature bin boundaries learned from the training data
        self.threshold = threshold
        self.loadData(trainData, trainLabel)
        self.tree = self.createTree(range(len(trainLabel)), range(len(trainData[0])))

    # load the data
    def loadData(self, trainData, trainLabel):
        if len(trainData) != len(trainLabel):
            raise ValueError('input error')
        self.trainData = trainData
        self.trainLabel = trainLabel
        self.trainDataDiscrete()
        # collect featureValus
        for data in self.trainData:
            for index, value in enumerate(data):
                if index not in self.featureValus:
                    self.featureValus[index] = [value]
                if value not in self.featureValus[index]:
                    self.featureValus[index].append(value)

    # discretize the continuous training data with equal-width binning,
    # using one bin per class
    def trainDataDiscrete(self):
        trainData = []
        interval = []
        countClass = len(set(self.trainLabel))
        if countClass == 0:
            countClass = 1
        for i in range(len(self.trainData[0])):
            tempData = [x[i] for x in self.trainData]
            step = (max(tempData) - min(tempData)) / countClass
            intervalTemp = []
            trainDataTemp = []
            for j in range(1, countClass + 1):
                intervalTemp.append(min(tempData) + j * step)   # upper bin boundaries
            interval.append(intervalTemp)
            for x in tempData:
                mark = countClass            # the maximum falls into the last bin
                for index, y in enumerate(intervalTemp):
                    if y - step <= x < y:
                        mark = index + 1
                trainDataTemp.append(mark)
            trainData.append(trainDataTemp)
        self.trainData = self.transpose(trainData)   # back to one row per sample
        self.interval = interval

    def transpose(self, matrix):
        new_matrix = []
        for i in range(len(matrix[0])):
            new_matrix.append([matrix[j][i] for j in range(len(matrix))])
        return new_matrix

    # empirical entropy of a (sub)set of sample indices
    def caculateEntropy(self, dataset):
        labelCount = self.labelCount(dataset)
        size = len(dataset)
        result = 0
        for i in labelCount.values():
            pi = i / float(size)
            result -= pi * (log(pi) / log(2))
        return result

    # information gain of splitting `dataset` on `feature`
    def caculateGain(self, dataset, feature):
        values = self.featureValus[feature]   # every possible value of the feature
        result = 0
        for v in values:
            subDataset = self.splitDataset(dataset=dataset, feature=feature, value=v)
            result += len(subDataset) / float(len(dataset)) * self.caculateEntropy(subDataset)
        return self.caculateEntropy(dataset=dataset) - result

    # indices in `dataset` whose `feature` equals `value`
    def splitDataset(self, dataset, feature, value):
        result = []
        for index in dataset:
            if self.trainData[index][feature] == value:
                result.append(index)
        return result

    # how often each label occurs in the given set of sample indices
    def labelCount(self, dataset):
        labelCount = {}
        for i in dataset:
            if self.trainLabel[i] in labelCount:
                labelCount[self.trainLabel[i]] += 1
            else:
                labelCount[self.trainLabel[i]] = 1
        return labelCount

    def createTree(self, dataset, features):
        """dataset: sample indices; features: candidate feature indices."""
        labelCount = self.labelCount(dataset)
        # if the feature set is empty, the tree is a single node carrying
        # the most frequent label in the dataset
        if not features:
            return max(list(labelCount.items()), key=lambda x: x[1])[0]
        # if the dataset contains only one label, the tree is a single node
        if len(labelCount) == 1:
            return list(labelCount.keys())[0]
        # information gain of every remaining feature
        l = list(map(lambda x: [x, self.caculateGain(dataset=dataset, feature=x)], features))
        if len(l) == 0:
            return max(list(labelCount.items()), key=lambda x: x[1])[0]
        # pick the feature with the largest information gain
        feature, gain = max(l, key=lambda x: x[1])
        # if the largest gain is below the threshold, the tree is a single node
        if self.threshold > gain:
            return max(list(labelCount.items()), key=lambda x: x[1])[0]
        tree = {}
        # remaining features for the subtrees
        subFeatures = list(filter(lambda x: x != feature, features))
        tree['feature'] = feature   # record the split feature; classify() needs it
        # build one subtree per value of the chosen feature
        for value in self.featureValus[feature]:
            subDataset = self.splitDataset(dataset=dataset, feature=feature, value=value)
            if not subDataset:          # skip empty branches
                continue
            tree[value] = self.createTree(dataset=subDataset, features=subFeatures)
        return tree

    def classify(self, data):
        def f(tree, data):
            if not isinstance(tree, dict):   # reached a leaf: return its label
                return tree
            return f(tree[data[tree['feature']]], data)
        return f(self.tree, data)


# discretize the (continuous) test data with the bin boundaries
# learned from the training data
def trainDataDiscrete(trainData, interval):
    matrix = []
    for i in range(len(trainData[0])):
        tempData = [x[i] for x in trainData]
        trainDataTemp = []
        for x in tempData:
            mark = len(interval[i])          # values above the last boundary: last bin
            last = float('-inf')             # values below the first boundary: first bin
            for index, y in enumerate(interval[i]):
                if last <= x < y:
                    mark = index + 1
                last = y
            trainDataTemp.append(mark)
        matrix.append(trainDataTemp)
    # transpose back to one row per sample
    new_matrix = []
    for i in range(len(matrix[0])):
        new_matrix.append([matrix[j][i] for j in range(len(matrix))])
    return new_matrix


# draw the decision tree
def plot_model(tree, name):
    g = Digraph("G", filename=name, format='png', strict=False)
    g.node("0", 'root')
    _sub_plot(g, tree, "0")
    g.view()


root = "0"


def _sub_plot(g, tree, inc):
    global root
    if not isinstance(tree, dict):       # the whole tree is a single leaf
        root = str(int(root) + 1)
        g.node(root, str(tree))
        g.edge(inc, root)
        return
    for i in tree:
        if i == 'feature':               # bookkeeping key, not a branch value
            continue
        root = str(int(root) + 1)
        if isinstance(tree[i], dict):
            g.node(root, str(i))
            g.edge(inc, root, str(i))
            _sub_plot(g, tree[i], root)
        else:
            g.node(root, str(tree[i]))   # leaf: class label
            g.edge(inc, root, str(i))


if __name__ == '__main__':
    trainData, testData, trainLabel, testLabel = loadSplitDataSet(
        r'C:\Users\huawei\Desktop\统计学习理论\实验三\classify\wine.data', 0.8)
    tree = DecisonTree(trainData=trainData, trainLabel=trainLabel, threshold=0)
    print(tree.tree)
    plot_model(tree.tree, "hello.gv")
    # evaluate on the held-out test set
    newTestData = trainDataDiscrete(testData, tree.interval)
    count = 0
    for i in range(len(newTestData)):
        try:
            if tree.classify(newTestData[i]) == testLabel[i]:
                count += 1
        except Exception as e:
            # a test sample may take a branch that never appeared in training
            print(Exception, ',', e)
    print('The accuracy is %.2f' % (count / len(testLabel)))
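To make the equal-width binning concrete, here is a minimal sketch of the interval logic used in trainDataDiscrete, run on a hypothetical toy feature column (the four values are made up for illustration; with three classes, as in the Wine data, each feature is split into three bins):

# Minimal sketch of the equal-width binning used above (toy values, 3 bins).
values = [10.0, 12.1, 13.0, 16.0]            # hypothetical feature column
bins = 3                                      # = number of classes in the Wine data
step = (max(values) - min(values)) / bins     # 2.0
boundaries = [min(values) + j * step for j in range(1, bins + 1)]
# boundaries == [12.0, 14.0, 16.0] (upper edge of each bin)
for x in values:
    mark = bins                               # default: last bin (catches the maximum)
    for index, y in enumerate(boundaries):
        if y - step <= x < y:
            mark = index + 1
    print(x, '->', mark)                      # 10.0->1, 12.1->2, 13.0->2, 16.0->3

Note that the bin boundaries learned from the training data are reused verbatim for the test data, which is why the DecisonTree instance keeps them in self.interval.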