通過sklearn中的紅酒訓練集測試并且畫樹
儲存到了桌面的pdf檔案中,目前還沒有進行中文顯示問題。
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import pandas as pd
wine = load_wine()
wine.data.shape
wine.target
pd.concat([pd.DataFrame(wine.data),pd.DataFrame(wine.target)],axis=1)
wine.feature_names
wine.target_names
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,wine.target,test_size=0.3)
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest) #傳回預測的準确度
print(score)
feature_name = ['酒精','蘋果酸','灰','灰的堿性','鎂','總酚','類黃酮','非黃烷類酚類','花青素','顔 色強度','色調','od280/od315稀釋葡萄酒','脯氨酸']
import graphviz
dot_data=tree.export_graphviz(clf
,feature_names=feature_name
,class_names=["琴酒","雪莉","貝爾摩德"]
,filled=True
,rounded=True
)
graph = graphviz.Source(dot_data)
graph.render("C:\\Users\\JYuXuAN\\Desktop\\tree")
sklearn決策樹與随機森林的差異
随機森林是利用了裝袋法,把許多棵樹集合在一起得到最優的值。
精度一定比單一的決策樹要高出很多。
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
wine = load_wine()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target, test_size=0.3)
# clf = DecisionTreeClassifier(random_state=0)
# rfc = RandomForestClassifier(random_state=0)
# clf = clf.fit(Xtrain, Ytrain)
# rfc = rfc.fit(Xtrain, Ytrain)
# score_c = clf.score(Xtest, Ytest)
# score_r = rfc.score(Xtest, Ytest)
#
# print("Single Tree:{}".format(score_c))
# print("Random Forest:{}.".format(score_r))
rfc_l=[]
clf_l=[]
for i in range(10):
rfc = RandomForestClassifier(n_estimators=25)
rfc_s = cross_val_score(rfc,wine.data,wine.target,cv=10).mean()
print(rfc_s)
# print(f"随機森林第{i}次:"+rfc_s)
rfc_l.append(rfc_s)
clf = DecisionTreeClassifier()
clf_s = cross_val_score(clf,wine.data,wine.target,cv=10).mean()
print(clf_s)
# print(f"決策樹第{i}次:"+clf_s)
clf_l.append(clf_s)
plt.plot(range(1,11),rfc_l,label = "Random Forest")
plt.plot(range(1,11),clf_l,label = "Decision Tree")
plt.legend()
plt.show()
畫出的圖像
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
wine=load_wine()
superpa = []
for i in range(200):
rfc = RandomForestClassifier(n_estimators=i+1,n_jobs=-1)
rfc_s = cross_val_score(rfc,wine.data,wine.target,cv=10).mean()
superpa.append(rfc_s)
print(max(superpa),superpa.index(max(superpa)))
plt.figure(figsize=[20,5])
plt.plot(range(1,201),superpa)
plt.show()
調參n_estimators 200 次 左右的結果資料圖