R&python的决策树实现及调参
- R实现
- python实现
注:本文不涉及决策树理论部分,若有兴趣请移步☞https://blog.csdn.net/weixin_43462348/article/details/101975391☜
步骤概览: $\begin{cases}1.\ 导入数据+了解数据\\2.\ 数据预处理\\3.\ 数据分割\\4.\ 建模\\5.\ 调参\end{cases}$
R实现
0 加载所需包
library(rpart)
library(rattle) # 画图工具
library(RColorBrewer) # 调色板
1 导入数据+了解数据
setwd("E:/r") # working directory containing the data file; NOTE(review): setwd() in scripts is fragile -- prefer relative paths
data<- read.csv("taitanic.csv",stringsAsFactors=F,na.strings="")
View(data) # eyeball the table to confirm it parsed correctly (no shifted rows)
head(data) # first few rows
str(data) # structure: column names, types, sample values
![](https://img.laitimes.com/img/9ZDMuAjOiMmIsIjOiQnIsICM38FdsYkRGZkRG9lcvx2bjxiNx8VZ6l2cs0TPB9EMNpWTyEleNBDOsJGcohVYsR2MMBjVtJWd0ckW65UbM5WOHJWa5kHT20ESjBjUIF2X0hXZ0xCMx81dvRWYoNHLrdEZwZ1Rh5WNXp1bwNjW1ZUba9VZwlHdssmch1mclRXY39CXldWYtlWPzNXZj9mcw1ycz9WL49zZuBnL1EjN2ADOxkDM4EjMwAjMwIzLc52YucWbp5GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.png)
library(Hmisc)
describe(data) # per-variable summary: distinct values, missing counts, distributions
'Name'变量有891个不同取值,考虑无实际用途;'Ticket'票号没有实际意义;Age和Cabin存在缺失值,其中Cabin缺失值超过50%,考虑删除;Embarked仅两个缺失值,考虑直接删除所在行。
2 数据预处理
①删除上述无用变量
library(dplyr)
# Drop the columns judged useless above: Name/Ticket (no predictive value)
# and Cabin (>50% missing).
# NOTE(review): dplyr is loaded but base subsetting is used here -- the
# library() call is not strictly required for this line.
data <- data[,!(colnames(data) %in% c('Name','Ticket','Cabin'))]
②缺失值处理
library(VIM)
# Visualize the missing-data pattern before imputing
aggr(data, plot = TRUE)
# Impute missing Age values with the column mean
data[is.na(data$Age), "Age"] <- mean(data$Age, na.rm = TRUE)
# Drop the remaining rows with any missing value (e.g. the two missing
# Embarked entries). The original ran complete.cases() twice in a row;
# once is sufficient -- the second pass was a no-op duplicate.
data <- data[complete.cases(data), ]
处理后数据查看:
③数据转换
data$Survived <- as.factor(data$Survived)
data$Pclass <- as.factor(data$Pclass)
data$Sex <- as.factor(data$Sex)
data$Embarked <- as.factor(data$Embarked)
str(data)
3 数据分割
library(caret)
# Reproducible 70/30 split, stratified on the outcome
set.seed(2019)
in_train <- createDataPartition(data$Survived, p = 0.7, list = FALSE)
train <- data[in_train, ]
test  <- data[-in_train, ]
4 建模
# Fit a classification tree with the Gini split criterion
set.seed(2020)
model1 <- rpart(Survived ~ ., data = train, method = "class",
                parms = list(split = "gini"))
model1
fancyRpartPlot(model1)
# Predict class labels on the hold-out set
p1 <- predict(model1, test, type = "class")
p1
# Accuracy = proportion of correctly classified passengers
# (equivalent to sum(diag(confusion)) / sum(confusion))
acc <- mean(p1 == test$Survived)
acc # 0.8195489
5 调参
# Fit an rpart classification tree with the given hyper-parameters and
# return its hold-out accuracy.
#
# min_bucket : minimum number of observations in any terminal node
# max_depth  : maximum depth of the tree
# com        : complexity parameter (cp) used for pruning
# train_data / test_data : data sets to fit and evaluate on; they default
#   to the global train/test so all existing calls keep working, but the
#   function no longer silently depends on globals.
accuracy <- function(min_bucket, max_depth, com,
                     train_data = train, test_data = test) {
  model <- rpart(Survived ~ ., data = train_data, method = "class",
                 control = rpart.control(minbucket = min_bucket,
                                         maxdepth = max_depth,
                                         cp = com))
  pred <- predict(model, test_data, type = "class")
  conf <- as.matrix(table(pred, test_data$Survived))
  sum(diag(conf)) / sum(conf)
}
# Search over maxdepth (i), minbucket (j) and randomly drawn cp values (z),
# keeping track of the best hold-out accuracy seen so far.
# Fixes: `<-` instead of `=` for assignment, a seed so the runif() cp draws
# are reproducible, and removal of the else-branch that printed 'NULL'
# thousands of times for every non-improving combination.
set.seed(2019)
accu <- 0
for (i in 1:10) {                             # maxdepth
  for (j in seq(1, 50, 5)) {                  # minbucket
    for (z in round(runif(50, 0, 0.1), 4)) {  # random cp candidates
      acc <- accuracy(j, i, z)
      if (acc >= accu) {                      # new (or tied) best found
        accu <- acc
        print(paste("i的值为:", i))
        print(paste("j的值为:", j))
        print(paste("z的值为:", z))
        print(accu)
      }
    }
  }
}
accuracy(6, 10, 0.0036) # 0.84586
最终得到最佳accuracy的参数:maxdepth=10,minbucket=6,cp=0.0036,accuracy值达到0.84586。
注:本文调参代码为本人原创,缺点在于for循环嵌套时间复杂度过大,如果有BUG或其他方法的小伙伴欢迎私信一起讨论!
python实现
0 导入所需库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
1 导入数据并查看
data = pd.read_csv("taitanic.csv")
data.info()   # column types and non-null counts
data.head()   # first few rows
2 数据预处理
# (1) Drop features with no predictive value (Name, Ticket) and Cabin (>50% missing)
data.drop(["Name","Cabin","Ticket"],inplace=True,axis=1)
data.head()
# (2) Impute missing Age with the column mean
data["Age"]=data["Age"].fillna(data["Age"].mean())
data = data.dropna() # only a couple of rows with missing values remain; drop them
data.info()
# (3) Encode the categorical text columns as integers
data["Embarked"].unique().tolist()
data["Embarked"] = data["Embarked"].map({'S':0,'C':1,'Q':2})
data["Sex"].unique().tolist()
data["Sex"] = data["Sex"].map({'female':0,'male':1})
3 数据分割
# Separate the target column from the feature matrix
y = data["Survived"]
x = data.drop(["Survived"],inplace = False,axis = 1)
# 70/30 train/test split
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3)
# Reset the row index of every split to 0..n-1
for subset in (xtrain, xtest, ytrain, ytest):
    subset.index = range(subset.shape[0])
x.head()
4 模型建立
# Fit a baseline decision tree and score it on the hold-out set
clf = DecisionTreeClassifier(random_state = 17)
clf = clf.fit(xtrain,ytrain)
score = clf.score(xtest,ytest)
score
# 10-fold cross-validated accuracy over the full data set
score = cross_val_score(clf,x,y,cv=10).mean()
score # 0.7425051072522983
5 调参
# Hyper-parameter grid for the decision tree
Parameters = {"criterion":('gini','entropy')
              ,"splitter":('best','random')
              ,"max_depth":[*range(1,10)]
              ,"min_samples_leaf":[*range(1,50,5)]
              ,"min_impurity_decrease":[*np.linspace(0,0.5,50)]
              }
Clf = DecisionTreeClassifier(random_state=17)
# BUG FIX: the original called GridSearchCV(clf, parameters, cv=10), but
# `parameters` (lowercase) was never defined -- a NameError -- and the
# freshly created estimator `Clf` was never used. Pass Clf and Parameters.
GS = GridSearchCV(Clf, Parameters, cv=10)
GS = GS.fit(xtrain, ytrain)
GS.best_params_
GS.best_score_ # 0.8183279742765274
(由于没有全部设置随机种子,每次结果有所不同,昨天跑到了0.83但是费时太多)综上所述,R调参费时较少且最优accuracy达到0.84。
注:粗略计算了下,python中调参用时20mins(包含交叉验证),R中2mins(无交叉验证)。
PS.若有问题欢迎讨论指正!