```py
# coding=utf-8
from math import log
import operator

def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # names of the two features
    return dataSet, labels
# Compute the Shannon entropy of the given dataset
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # the class label is the last column
        # build a dictionary of label frequencies
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1  # increment the count for this label
    # compute the Shannon entropy: H = -sum(p * log2(p))
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
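
# Sanity check by hand for the sample data (2 'yes', 3 'no'):
# H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971 bits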

# Split the dataset; the three arguments are the dataset to split,
# the index of the feature to split on, and the feature value to keep
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # extract the matching rows, dropping the split feature's column
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet  # returns a list of reduced rows
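
# Example: splitDataSet(dataSet, 0, 1) keeps the rows whose first feature
# is 1 and drops that column: [[1, 'yes'], [1, 'yes'], [0, 'no']]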

# Choose the best feature to split the dataset on
def chooseBestFeatureToSplit(dataSet):
    numFeature = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeature):
        featureList = [example[i] for example in dataSet]  # all values taken by feature i
        uniqueVals = set(featureList)  # the distinct values of feature i
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)  # split on feature i at this value
            prob = len(subDataSet) / float(len(dataSet))  # fraction of rows in this subset
            newEntropy += prob * calcShannonEnt(subDataSet)  # weighted entropy of the split
        infoGain = baseEntropy - newEntropy
        # keep the feature with the largest information gain;
        # the larger the gain, the more decisive the feature
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
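
# On the sample data the gains work out to about 0.420 for 'no surfacing'
# (index 0) and 0.171 for 'flippers' (index 1), so index 0 is chosen.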

# Majority vote, used once all features have been consumed
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)  # sort by count, descending
    return sortedClassCount[0][0]  # return the most frequent class
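
# Example: majorityCnt(['yes', 'no', 'no']) returns 'no'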

# Recursively build the decision tree
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # stop splitting when all classes are identical
    if len(dataSet[0]) == 1:  # all features used up: fall back to majority vote
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # pick the best split
    bestFeatLabel = labels[bestFeat]  # label of the chosen feature
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]  # drop the used label so indices stay aligned with the reduced rows
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        # recursively build a subtree for each value of the chosen feature
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

if __name__ == "__main__":
    dataSet, labels = createDataSet()
    print(createTree(dataSet, labels))
```
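
Running the module prints the learned tree as a nested dict; for the toy dataset above it should come out (up to key ordering) as:

```
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
```

For illustration, a minimal helper that walks such a tree to classify one sample is sketched below. It is not part of the original file; the name `classify` and its signature are assumptions.

```py
# Hypothetical helper, not in the original file: follow the nested-dict
# tree down to a leaf for a single feature vector.
def classify(tree, featLabels, testVec):
    featLabel = list(tree.keys())[0]         # feature tested at this node
    subTree = tree[featLabel]
    featIndex = featLabels.index(featLabel)  # position of that feature in the input
    branch = subTree[testVec[featIndex]]     # take the branch for this sample's value
    if isinstance(branch, dict):             # internal node: keep descending
        return classify(branch, featLabels, testVec)
    return branch                            # leaf: the predicted class

# Example (assumed usage, with the tree printed above):
# tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
# classify(tree, ['no surfacing', 'flippers'], [1, 0])  # -> 'no'
```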