5、再采样方法
看了中大一位博主的文章,其中提到《an introduction to statistical learning with R》这本书,粗略看了下很受启发,本书可以作为统计学专业硕士自学教材。由于本人最近在做变量选择相关研究,于是从第五章开始学习。
此处主要给出书中样例R语言代码实现,一些解释也在代码注释中,详细内容参见教材。
再采样方法主要分为交叉验证和自助法。
交叉验证
5.1验证集方法
# 5.1 Validation-set approach -------------------------------------------
# Randomly split the 392 Auto observations into a training half and a
# validation half, fit polynomial regressions of degree 1-4 on the
# training half, and compare their validation-set MSEs.
library(ISLR)
set.seed(1)
train <- sample(392, 196)

# Linear model fit on the training rows only.
# Bug fix: the original had `subset = trian` (typo for `train`), which
# aborts with "object 'trian' not found".
lm.fit <- lm(mpg ~ horsepower, data = Auto, subset = train)
# Validation-set MSE: mean squared error on the held-out rows.
# Uses Auto$mpg directly instead of attach(Auto) — attach() pollutes
# the search path and is best avoided.
valmse1 <- mean((Auto$mpg - predict(lm.fit, Auto))[-train]^2)

# Quadratic, cubic and quartic polynomial fits on the same split.
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto, subset = train)
valmse2 <- mean((Auto$mpg - predict(lm.fit2, Auto))[-train]^2)
lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data = Auto, subset = train)
valmse3 <- mean((Auto$mpg - predict(lm.fit3, Auto))[-train]^2)
lm.fit4 <- lm(mpg ~ poly(horsepower, 4), data = Auto, subset = train)
valmse4 <- mean((Auto$mpg - predict(lm.fit4, Auto))[-train]^2)

# Plot validation MSE against polynomial degree (model flexibility).
color <- palette(rainbow(10))
x <- c(1, 2, 3, 4)
y <- c(valmse1, valmse2, valmse3, valmse4)
plot(x, y,
     ylim = c(10, 30), type = "b",
     xlab = "flexibility", ylab = "MSE", col = color[1])
#设置10组不同种子模拟
# Repeat the random split with 9 further seeds and overlay the MSE
# curves, illustrating the variability of the validation-set estimate.
# Bug fixes relative to the original: `subset = trian` -> `train`, and
# the repeated attach(Auto) inside the loop is removed (Auto$mpg is
# referenced explicitly instead).
for (i in 1:9) {
  set.seed(1 + i)
  train <- sample(392, 196)
  # Validation MSE for polynomial degrees 1-4 on this split.
  # (poly(horsepower, 1) gives the same fitted values as the plain
  # linear term, so a single loop over the degree suffices.)
  val.mse <- numeric(4)
  for (d in 1:4) {
    fit <- lm(mpg ~ poly(horsepower, d), data = Auto, subset = train)
    val.mse[d] <- mean((Auto$mpg - predict(fit, Auto))[-train]^2)
  }
  # Add this seed's curve to the existing plot.
  lines(1:4, val.mse, type = "b", col = color[i + 1])
}
![](https://img.laitimes.com/img/9ZDMuAjOiMmIsIjOiQnIsIiclRnblN2XjlGcjAzNfRHLGZkRGZkRfJ3bs92YsYTMfVmepNHL9cGVP5WNXl1bk1mYohmMMBjVtJWd0ckW65UbM5WOHJWa5kHT20ESjBjUIF2X0hXZ0xCMx81dvRWYoNHLrdEZwZ1Rh5WNXp1bwNjW1ZUba9VZwlHdssmch1mclRXY39CXldWYtlWPzNXZj9mcw1ycz9WL49zZuBnL4AzM2ATM1gTMzATMxAjMwIzLc52YucWbp5GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.png)
5.2 留一交叉验证
# 5.2 Leave-one-out cross-validation (LOOCV) ----------------------------
# glm() with no family argument performs ordinary least squares, so its
# coefficient estimates match lm(); we fit with glm() because cv.glm()
# from the boot package requires a glm object.
glm.fit <- glm(mpg ~ horsepower, data = Auto)
coef(glm.fit)
lm.fit <- lm(mpg ~ horsepower, data = Auto)
coef(lm.fit)

library(boot)
glm.fit <- glm(mpg ~ horsepower, data = Auto)
cv.err <- cv.glm(Auto, glm.fit)
cv.err$delta  # raw and bias-corrected LOOCV error estimates

# LOOCV error for polynomial degrees 1 through 5.
cv.error <- rep(0, 5)
for (i in seq_len(5)) {
  glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  cv.error[i] <- cv.glm(Auto, glm.fit)$delta[1]
}

# Plot LOOCV error against polynomial degree.
x <- c(1, 2, 3, 4, 5)
plot(x, cv.error, xlab = "flexibility", ylab = "MSE", type = "b")
5.3 k折交叉验证
# 5.3 k-fold cross-validation -------------------------------------------
# K is the number of folds: the data are split into K groups and the
# test error is averaged over the K held-out folds.
set.seed(17)
cv.error.10 <- rep(0, 10)
for (i in seq_len(10)) {
  glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  cv.error.10[i] <- cv.glm(Auto, glm.fit, K = 10)$delta[1]
}
# cv.error.10[i] holds the 10-fold CV estimate of the MSE for a
# degree-i polynomial fit.
x <- c(1:10)
plot(x, cv.error.10,
     xlab = "flexibility", ylab = "MSE", type = "b", col = color[1])

# Overlay the curves for six additional seeds to show how much the
# estimate varies with the random fold assignment.
for (j in seq_len(6)) {
  set.seed(17 + j)
  cv.error.10 <- rep(0, 10)
  for (i in seq_len(10)) {
    glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
    cv.error.10[i] <- cv.glm(Auto, glm.fit, K = 10)$delta[1]
  }
  lines(x, cv.error.10, type = "b", col = color[j + 1])
}
自助法
5.4.1 估计参数的精度
# Statistic to be bootstrapped: the variance-minimizing portfolio
# weight. The value of a that minimizes var(a*X + (1-a)*Y) is
#   a = (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2*cov(X, Y)).
# `data` must have columns X and Y; `index` selects the rows used
# (this is the signature boot::boot expects of a statistic).
alpha.fn <- function(data, index) {
  x <- data$X[index]
  y <- data$Y[index]
  vx <- var(x)
  vy <- var(y)
  cxy <- cov(x, y)
  (vy - cxy) / (vx + vy - 2 * cxy)
}
# Evaluate the statistic on the first 100 observations of the Portfolio data set
alpha.fn(Portfolio,1:100)# returns the point estimate of a
set.seed(1)
# One bootstrap replicate: resample 100 rows with replacement
alpha.fn(Portfolio,sample(100,100,replace = T))
# Bootstrap: draw samples equal in size to the original data, repeated 1000 times
# Bootstrap B=1000
boot(Portfolio,alpha.fn,R=1000)
ORDINARY NONPARAMETRIC BOOTSTRAP
Call:
boot(data = Portfolio, statistic = alpha.fn, R = 1000)
Bootstrap Statistics :
original bias std. error
t1* 0.5758321 -0.001695873 0.09366347
可以知道由自助采样法(B=1000)得到a的估计值为0.576
a的估计值的标准差为0.094
5.4.2 估计一个线性模型的精度
# Bootstrap statistic: the OLS coefficients of mpg ~ horsepower fitted
# on the rows of `data` selected by `index`. Returns the named
# coefficient vector c("(Intercept)", "horsepower").
boot.fn <- function(data, index) {
  fit <- lm(mpg ~ horsepower, data = data, subset = index)
  coef(fit)
}
boot.fn(Auto,1:392)
(Intercept) horsepower
39.9358610 -0.1578447
set.seed(1)
boot.fn(Auto,sample(392,392,replace = T))
(Intercept) horsepower
40.3404517 -0.1634868
> boot(Auto,boot.fn,1000)
ORDINARY NONPARAMETRIC BOOTSTRAP
Call:
boot(data = Auto, statistic = boot.fn, R = 1000)
Bootstrap Statistics :
original bias std. error
t1* 39.9358610 0.0549915227 0.841925746
t2* -0.1578447 -0.0006210818 0.007348956
可以知道由自助法进行线性模型拟合得到的参数估计值:
belta1 = -0.158,belta0 = 39.936
参数估计值的标准误差分别为:
SE(belta0) = 0.842,SE(belta1) = 0.007
> summary(lm(mpg~horsepower,data = Auto))$coef
Estimate Std. Error t value Pr(>|t|)
(Intercept) 39.9358610 0.717498656 55.65984 1.220362e-187
horsepower -0.1578447 0.006445501 -24.48914 7.031989e-81
直接由最小二乘得到参数估计值为:
belta1 = -0.158,belta0 = 39.936
参数估计值的标准误差分别为:
SE(belta0) = 0.717,SE(belta1) = 0.006
5.4.3 估计一个二次多项式模型的精度
自助法估计
# Bootstrap statistic for the quadratic model: coefficients of
# mpg ~ horsepower + horsepower^2 fitted on the rows given by `index`.
boot.fn <- function(data, index) {
  quad.fit <- lm(mpg ~ horsepower + I(horsepower^2),
                 data = data, subset = index)
  coefficients(quad.fit)
}
# Bootstrap the quadratic-model coefficients with R = 1000 resamples
set.seed(1)
boot(Auto,boot.fn,1000)
ORDINARY NONPARAMETRIC BOOTSTRAP
Call:
boot(data = Auto, statistic = boot.fn, R = 1000)
Bootstrap Statistics :
original bias std. error
t1* 56.900099702 3.511640e-02 2.0300222526
t2* -0.466189630 -7.080834e-04 0.0324241984
t3* 0.001230536 2.840324e-06 0.0001172164
普通二次多项式估计
> summary(lm(mpg~horsepower+I(horsepower^2),data = Auto))$coef
Estimate Std. Error t value Pr(>|t|)
(Intercept) 56.900099702 1.8004268063 31.60367 1.740911e-109
horsepower -0.466189630 0.0311246171 -14.97816 2.289429e-40
I(horsepower^2) 0.001230536 0.0001220759 10.08009 2.196340e-21