機器學習-決策樹-ID3決策樹
原理看上一篇,這篇只有代碼實作
它以資訊熵為度量標準,劃分出決策樹特徵節點,每次優先選取資訊量最多的屬性,也就是使資訊熵變為最小的屬性,以構造一棵資訊熵下降最快的決策樹。
缺點:ID3 沒有剪枝策略,傾向於選取取值數目較多的特徵,且只能處理離散型特徵。
代碼
from numpy import *
import math
import copy
import pickle
class ID3DTree(object):
    """ID3 decision tree classifier.

    The tree is grown by repeatedly choosing the feature with the largest
    information gain (greatest reduction in Shannon entropy).  Samples are
    rows of strings; the last column of each row is the class label.  The
    learned tree is a nested dict: {feature_name: {feature_value: subtree}},
    with class labels at the leaves.
    """

    def __init__(self):
        self.tree = {}      # learned tree (nested dicts, leaves are labels)
        self.dataSet = []   # training samples: list of rows (lists of str)
        self.labels = []    # feature names, one per non-label column

    def loadDataSet(self, path, labels):
        """Load tab-separated training data from *path*.

        Each non-blank line is one sample; fields are split on tabs and the
        last field is the class label.  *labels* names the feature columns.
        """
        # Context manager guarantees the file is closed even on error.
        with open(path, "r") as fp:
            rows = fp.read().splitlines()
        self.dataSet = [row.split("\t") for row in rows if row.strip()]
        self.labels = labels

    def train(self):
        """Build self.tree from self.dataSet and self.labels."""
        # Deep-copy labels: buildTree deletes entries as features are consumed.
        self.tree = self.buildTree(self.dataSet, copy.deepcopy(self.labels))

    def buildTree(self, dataSet, labels):
        """Recursively construct the decision tree for *dataSet*.

        Returns a class label (leaf) or a dict
        {feature_name: {feature_value: subtree}}.  *labels* is mutated
        (the chosen feature is deleted at each level).
        """
        cateList = [row[-1] for row in dataSet]
        # Stop 1: every sample has the same label -> leaf.
        if cateList.count(cateList[0]) == len(cateList):
            return cateList[0]
        # Stop 2: only the label column remains (no features left) ->
        # fall back to majority vote.
        if len(dataSet[0]) == 1:
            return self.maxCate(cateList)
        # Core of the algorithm: split on the feature with max info gain.
        bestFeat = self.getBestFeat(dataSet)
        bestFeatLabel = labels[bestFeat]
        tree = {bestFeatLabel: {}}
        del labels[bestFeat]  # feature is consumed at this level
        for value in set(row[bestFeat] for row in dataSet):
            # Copy labels so sibling branches do not see each other's deletions.
            subset = self.splitDataSet(dataSet, bestFeat, value)
            tree[bestFeatLabel][value] = self.buildTree(subset, labels[:])
        return tree

    def maxCate(self, catelist):
        """Return the most frequent label in *catelist* (majority vote).

        BUG FIX: the original built a {count: label} dict, so two labels
        with the same count silently overwrote each other and the vote
        could be wrong.  Ties now go to the earliest such label.
        """
        return max(catelist, key=catelist.count)

    def getBestFeat(self, dataSet):
        """Return the column index of the feature with the largest
        information gain, or -1 if no split improves on the base entropy."""
        numFeatures = len(dataSet[0]) - 1  # last column is the label
        baseEntropy = self.computeEntropy(dataSet)
        bestInfoGain = 0.0
        bestFeature = -1
        for i in range(numFeatures):
            # Expected entropy after splitting on feature i, weighted by
            # the probability of each feature value.
            newEntropy = 0.0
            for value in set(row[i] for row in dataSet):
                subset = self.splitDataSet(dataSet, i, value)
                prob = len(subset) / float(len(dataSet))
                newEntropy += prob * self.computeEntropy(subset)
            infoGain = baseEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
        return bestFeature

    def computeEntropy(self, dataSet):
        """Return the Shannon entropy (base 2) of the label column."""
        datalen = float(len(dataSet))
        # Single O(n) counting pass; the original rebuilt counts with
        # repeated list.count(), which was O(n^2).
        counts = {}
        for row in dataSet:
            counts[row[-1]] = counts.get(row[-1], 0) + 1
        infoEntropy = 0.0
        for cnt in counts.values():
            prob = cnt / datalen
            infoEntropy -= prob * math.log(prob, 2)
        return infoEntropy

    def splitDataSet(self, dataSet, axis, value):
        """Return the rows whose column *axis* equals *value*, with that
        column removed (the consumed feature is dropped from each row)."""
        return [row[:axis] + row[axis + 1:]
                for row in dataSet if row[axis] == value]

    def predict(self, inputTree, featLabels, testVec):
        """Classify *testVec* by walking *inputTree*.

        *featLabels* maps feature names back to positions in *testVec*.
        Raises KeyError if the sample contains a feature value never seen
        during training (same behavior as the original).
        """
        root = list(inputTree.keys())[0]        # feature tested at this node
        secondDict = inputTree[root]            # value -> subtree or label
        featIndex = featLabels.index(root)      # position of root feature
        valueOfFeat = secondDict[testVec[featIndex]]
        if isinstance(valueOfFeat, dict):
            return self.predict(valueOfFeat, featLabels, testVec)  # recurse
        return valueOfFeat

    def storeTree(self, inputTree, filename):
        """Pickle *inputTree* to *filename*."""
        with open(filename, 'wb') as fw:
            pickle.dump(inputTree, fw)

    def grabTree(self, filename):
        """Load a pickled tree from *filename*.

        NOTE(review): pickle.load executes arbitrary code from untrusted
        files -- only load trees this program created itself.
        """
        # BUG FIX: the original never closed this file handle.
        with open(filename, 'rb') as fr:
            return pickle.load(fr)
def main():
    """Train an ID3 tree on the demo dataset, persist it, reload it, and
    classify one sample."""
    base = "/Users/FengZhen/Desktop/accumulate/機器學習/決策樹/"
    labels = ["age", "revenue", "student", "credit"]

    # Train.
    dtree = ID3DTree()
    dtree.loadDataSet(base + "決策樹訓練集.txt", labels)
    dtree.train()
    print(dtree.tree)

    # Persist, then reload.
    # BUG FIX: the original commented out storeTree but still called
    # grabTree, which crashes with FileNotFoundError unless a previously
    # written pickle happens to exist on disk.
    tree_path = base + "決策樹.tree"
    dtree.storeTree(dtree.tree, tree_path)
    mytree = dtree.grabTree(tree_path)
    print(mytree)

    # Classify one sample (feature order must match *labels*).
    vector = ['0', '1', '0', '0']
    print(dtree.predict(mytree, labels, vector))


if __name__ == "__main__":
    main()