本文中講解的是使用
sklearn
實作決策樹及其建模過程,包含
- 資料的清洗和資料分離
train_test_split
- 採用不同的指標,基尼係數或者資訊熵進行建模,使用的是X_train和y_train
- 實例化
- 擬合fit
- 預測功能:採用上面兩種實例化的模型進行預測
y_pred = clf_gini.predict(X_test)
- 結果評估
- 混淆矩陣
- 準確率
- 分類報告
![](https://img.laitimes.com/img/__Qf2AjLwojIjJCLyojI0JCLiAjM2EzLcd3LcJzLcJzdllmVldWYtl2PnVGcq5CMlFXOrVzahxmavwVM3EDMzQTNtUGall3LcVmdhNXLwRHdo9CXt92YucWbpRWdvx2Yx5yazF2Lc9CX6MHc0RHaiojIsJye.jpeg)
封裝成函數實作
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix # 混淆矩陣
from sklearn.model_selection import train_test_split # 資料分離子產品
from sklearn.tree import DecisionTreeClassifier # 分類決策樹
from sklearn.metrics import accuracy_score # 評價名額
from sklearn.metrics import classification_report # 生成分類結果報告子產品
# 讀取資料 importing data
def load_data(
    source=(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
        "balance-scale/balance-scale.data"
    ),
):
    """Read the comma-separated dataset and print a short summary of it.

    Args:
        source: Anything ``pandas.read_csv`` accepts (URL, local path or
            file-like object). Defaults to the UCI balance-scale data URL
            the script has always used, so ``load_data()`` keeps working.

    Returns:
        pandas.DataFrame: the parsed data. The input is read without a
        header row (``header=None``), so columns are labelled 0, 1, 2, ...
    """
    # header=None: the first line of the file is data, not column names.
    balance_data = pd.read_csv(source, sep=',', header=None)
    print("Dataset Length", len(balance_data))
    print(balance_data.head())
    return balance_data
# 訓練集和測試集的分離 splitting the dataset into train and test
def split_dataset(balance_data, *, test_size=0.3, random_state=100):
    """Separate features from labels and split them into train/test sets.

    Column 0 of ``balance_data`` is used as the label and columns 1-4 as
    the features.

    Args:
        balance_data: DataFrame whose first column is the class label and
            whose next four columns are the features.
        test_size: Passed to ``train_test_split``; defaults to 0.3, the
            value that was previously hard-coded.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible; defaults to the previously hard-coded 100.

    Returns:
        tuple: ``(X, y, X_train, X_test, y_train, y_test)``.
    """
    values = balance_data.values
    X = values[:, 1:5]  # feature columns
    y = values[:, 0]  # label column
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X, y, X_train, X_test, y_train, y_test
# 使用基尼系數進行訓練 training with giniIndex
def train_using_gini(X_train, y_train, *, random_state=100, max_depth=3,
                     min_samples_leaf=5):
    """Fit a ``DecisionTreeClassifier`` that uses the Gini criterion.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to the classifier (default 100).
        max_depth: Maximum tree depth forwarded to the classifier
            (default 3).
        min_samples_leaf: Minimum samples per leaf forwarded to the
            classifier (default 5).

    Returns:
        The fitted classifier.
    """
    # Instantiate first, then fit on the training data.
    clf_gini = DecisionTreeClassifier(
        criterion="gini",
        random_state=random_state,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
    )
    clf_gini.fit(X_train, y_train)
    return clf_gini
# 使用資訊熵進行訓練 training with entropy
def train_using_entropy(X_train, y_train, *, random_state=100, max_depth=3,
                        min_samples_leaf=5):
    """Fit a ``DecisionTreeClassifier`` that uses the entropy criterion.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to the classifier (default 100).
        max_depth: Maximum tree depth forwarded to the classifier
            (default 3).
        min_samples_leaf: Minimum samples per leaf forwarded to the
            classifier (default 5).

    Returns:
        The fitted classifier.
    """
    # Instantiate first, then fit on the training data.
    clf_entropy = DecisionTreeClassifier(
        criterion="entropy",
        random_state=random_state,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
    )
    clf_entropy.fit(X_train, y_train)
    return clf_entropy
# 預測功能 make predictions
def prediction(X_test, clf_object):
    """Predict labels for ``X_test``, print them, and return them.

    Args:
        X_test: Samples to classify; passed unchanged to
            ``clf_object.predict``.
        clf_object: Any object exposing a ``predict(X)`` method, e.g. one
            of the fitted decision trees.

    Returns:
        Whatever ``clf_object.predict(X_test)`` returns.
    """
    y_pred = clf_object.predict(X_test)
    print("Predicted values:")
    print(y_pred)
    return y_pred
# 計算準确率 calculate accuracy
def cal_accuracy(y_test, y_pred):
    """Print the confusion matrix, accuracy and classification report.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        The accuracy as a percentage (``accuracy_score * 100``), which is
        also the value printed.
    """
    # The matrix and the report span several lines; start them on their own
    # line so the first row is not pushed out of alignment by the label.
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
    accuracy = accuracy_score(y_test, y_pred) * 100
    print("Accuracy:", accuracy)
    print("Report:", classification_report(y_test, y_pred), sep="\n")
    return accuracy
def main():
    """Load the data, train both trees, then predict and evaluate each."""
    data = load_data()
    # Only the train/test partitions are needed here; the full X and y that
    # split_dataset also returns are ignored.
    _, _, X_train, X_test, y_train, y_test = split_dataset(data)

    clf_gini = train_using_gini(X_train, y_train)
    clf_entropy = train_using_entropy(X_train, y_train)

    runs = (
        ("result using gini Index:", clf_gini),
        ("result using Entropy:", clf_entropy),
    )
    for title, clf in runs:
        print(title)
        y_pred = prediction(X_test, clf)
        cal_accuracy(y_test, y_pred)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
複制