# 3.1.1信息增益

3小时前 1次浏览
```python
1 #计算给定数据集的香农熵
2 from math import log
3
def calcShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the class labels in dataSet.

    Only the last element of each sample is read; it is treated as that
    sample's class label and must be hashable.

    Args:
        dataSet: A sized iterable of samples (it is passed to ``len`` and
            iterated once). Each sample must support ``sample[-1]``.

    Returns:
        ``-sum(p * log(p, 2))`` over the relative frequency ``p`` of each
        distinct label, as a float. An empty dataSet yields ``0.0``.
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    numEntries = len(dataSet)  # total number of samples

    # Tally how often each class label occurs, e.g. {'yes': 2, 'no': 3}.
    labelCounts = Counter(featVec[-1] for featVec in dataSet)

    shannonEnt = 0.0
    for count in labelCounts.values():
        # float() keeps true division under Python 2 as well.
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)  # entropy formula term
    return shannonEnt
16 '''
17         if currentLabel not in labelCounts.keys():   #填充字典：以currentLabel为key
18             labelCounts[currentLabel] = 0
19         labelCounts[currentLabel] += 1   #注意缩进
20         #以上得到字典：{'yes':2,'no':3}
21 '''
22 '''
23 if currentLabel not in labelCounts.keys():
24             labelCounts[currentLabel] = 1
25         else:
26             labelCounts[currentLabel] += 1
27         #以上得到字典：{'yes':2,'no':3}
28 '''
def createDataSet():
    """Return a small hard-coded sample data set and its column names.

    Returns:
        A ``(dataSet, labels)`` tuple. ``dataSet`` is a newly built list of
        five rows, each holding two 0/1 values followed by a class-label
        string. ``labels`` is a list of two names, presumably naming the
        two leading feature columns (verify against callers).
    """
    dataSet = [
        [1, 1, 'maybe'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels