Variable Selection
Variable Selection
20
# Fit subset-selection models of Salary on all other columns of Hitters,
# allowing models with up to 19 variables.
regfit.full <- regsubsets(Salary ~ ., Hitters, nvmax = 19)
# Keep the summary object; its components are inspected and plotted later.
reg.summary <- summary(regfit.full)
# Show which components the summary provides.
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
RSS
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
3.6e+07
3.2e+07
2.8e+07
RSS
2.4e+07
10
15
Number of Variables
Adjusted R-squared
which.max(reg.summary$adjr2)
## [1] 11
plot(reg.summary$adjr2,xlab ="Number of Variables",ylab="Adjusted RSq",type="l")
points(11,reg.summary$adjr2[11],col="red",cex=2, pch=20)
0.50
0.45
0.40
0.35
Adjusted RSq
10
15
Number of Variables
Cp
which.min(reg.summary$cp)
## [1] 10
plot(reg.summary$cp,xlab ="Number of Variables",ylab="Cp",type="l")
points(10,reg.summary$cp[10],col ="red",cex=2, pch=20)
100
80
60
20
40
Cp
10
15
Number of Variables
BIC
which.min(reg.summary$bic)
## [1] 6
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type="l")
points (6,reg.summary$bic[6],col ="red",cex=2,pch=20)
90
110
150
130
BIC
10
Number of Variables
Selected Variables
plot(regfit.full,scale ="r2")
15
(Intercept)
AtBat
Hits
HmRun
Runs
RBI
Walks
Years
CAtBat
CHits
CHmRun
CRuns
CRBI
CWalks
LeagueN
DivisionW
PutOuts
Assists
Errors
NewLeagueN
r2
0.55
0.55
0.55
0.55
0.55
0.55
0.54
0.54
0.54
0.54
0.53
0.53
0.51
0.51
0.49
0.48
0.45
0.43
0.32
plot(regfit.full,scale ="adjr2")
(Intercept)
AtBat
Hits
HmRun
Runs
RBI
Walks
Years
CAtBat
CHits
CHmRun
CRuns
CRBI
CWalks
LeagueN
DivisionW
PutOuts
Assists
Errors
NewLeagueN
adjr2
0.52
0.52
0.52
0.52
0.52
0.52
0.52
0.52
0.51
0.51
0.51
0.51
0.5
0.5
0.48
0.47
0.45
0.42
0.32
plot(regfit.full,scale ="Cp")
(Intercept)
AtBat
Hits
HmRun
Runs
RBI
Walks
Years
CAtBat
CHits
CHmRun
CRuns
CRBI
CWalks
LeagueN
DivisionW
PutOuts
Assists
Errors
NewLeagueN
Cp
5
5.9
6.2
7.3
7.4
8.9
10
12
13
14
14
16
18
20
22
28
39
51
100
plot(regfit.full,scale ="bic")
(Intercept)
AtBat
Hits
HmRun
Runs
RBI
Walks
Years
CAtBat
CHits
CHmRun
CRuns
CRBI
CWalks
LeagueN
DivisionW
PutOuts
Assists
Errors
NewLeagueN
bic
150
150
150
150
140
140
140
140
140
130
130
130
120
120
110
110
100
96
91
AtBat
-1.8685892
PutOuts
0.2643076
Hits
7.6043976
Walks
3.6976468
CRBI
0.6430169
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
Years
FALSE
FALSE
CAtBat
FALSE
FALSE
CHits
FALSE
FALSE
CHmRun
FALSE
FALSE
CRuns
FALSE
FALSE
CRBI
FALSE
FALSE
CWalks
FALSE
FALSE
LeagueN
FALSE
FALSE
DivisionW
FALSE
FALSE
PutOuts
FALSE
FALSE
Assists
FALSE
FALSE
Errors
FALSE
FALSE
NewLeagueN
FALSE
FALSE
1 subsets of each size up to 19
Selection Algorithm: forward
AtBat Hits HmRun Runs RBI Walks Years
1 ( 1 ) " "
" " " "
" " " " " "
" "
2 ( 1 ) " "
"*" " "
" " " " " "
" "
3 ( 1 ) " "
"*" " "
" " " " " "
" "
4 ( 1 ) " "
"*" " "
" " " " " "
" "
5 ( 1 ) "*"
"*" " "
" " " " " "
" "
6 ( 1 ) "*"
"*" " "
" " " " "*"
" "
7 ( 1 ) "*"
"*" " "
" " " " "*"
" "
8 ( 1 ) "*"
"*" " "
" " " " "*"
" "
9 ( 1 ) "*"
"*" " "
" " " " "*"
" "
10 ( 1 ) "*"
"*" " "
" " " " "*"
" "
11 ( 1 ) "*"
"*" " "
" " " " "*"
" "
12 ( 1 ) "*"
"*" " "
"*" " " "*"
" "
13 ( 1 ) "*"
"*" " "
"*" " " "*"
" "
14 ( 1 ) "*"
"*" "*"
"*" " " "*"
" "
15 ( 1 ) "*"
"*" "*"
"*" " " "*"
" "
16 ( 1 ) "*"
"*" "*"
"*" "*" "*"
" "
17 ( 1 ) "*"
"*" "*"
"*" "*" "*"
" "
18 ( 1 ) "*"
"*" "*"
"*" "*" "*"
"*"
19 ( 1 ) "*"
"*" "*"
"*" "*" "*"
"*"
CRBI CWalks LeagueN DivisionW PutOuts
1 ( 1 ) "*" " "
" "
" "
" "
2 ( 1 ) "*" " "
" "
" "
" "
3 ( 1 ) "*" " "
" "
" "
"*"
4 ( 1 ) "*" " "
" "
"*"
"*"
5 ( 1 ) "*" " "
" "
"*"
"*"
6 ( 1 ) "*" " "
" "
"*"
"*"
7 ( 1 ) "*" "*"
" "
"*"
"*"
8 ( 1 ) "*" "*"
" "
"*"
"*"
9 ( 1 ) "*" "*"
" "
"*"
"*"
10 ( 1 ) "*" "*"
" "
"*"
"*"
11 ( 1 ) "*" "*"
"*"
"*"
"*"
12 ( 1 ) "*" "*"
"*"
"*"
"*"
13 ( 1 ) "*" "*"
"*"
"*"
"*"
14 ( 1 ) "*" "*"
"*"
"*"
"*"
15 ( 1 ) "*" "*"
"*"
"*"
"*"
16 ( 1 ) "*" "*"
"*"
"*"
"*"
17 ( 1 ) "*" "*"
"*"
"*"
"*"
18 ( 1 ) "*" "*"
"*"
"*"
"*"
10
## 19
( 1 ) "*"
"*"
"*"
"*"
"*"
"*"
"*"
"*"
150
150
150
140
140
140
140
140
140
130
130
130
120
120
110
110
100
96
91
(Intercept)
AtBat
Hits
HmRun
Runs
RBI
Walks
Years
CAtBat
CHits
CHmRun
CRuns
CRBI
CWalks
LeagueN
DivisionW
PutOuts
Assists
Errors
NewLeagueN
bic
plot(regfit.fwd,scale="bic")
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
CHmRun
FALSE
FALSE
CRuns
FALSE
FALSE
CRBI
FALSE
FALSE
CWalks
FALSE
FALSE
LeagueN
FALSE
FALSE
DivisionW
FALSE
FALSE
PutOuts
FALSE
FALSE
Assists
FALSE
FALSE
Errors
FALSE
FALSE
NewLeagueN
FALSE
FALSE
1 subsets of each size up to 19
Selection Algorithm: backward
AtBat Hits HmRun Runs RBI Walks Years
1 ( 1 ) " "
" " " "
" " " " " "
" "
2 ( 1 ) " "
"*" " "
" " " " " "
" "
3 ( 1 ) " "
"*" " "
" " " " " "
" "
4 ( 1 ) "*"
"*" " "
" " " " " "
" "
5 ( 1 ) "*"
"*" " "
" " " " "*"
" "
6 ( 1 ) "*"
"*" " "
" " " " "*"
" "
7 ( 1 ) "*"
"*" " "
" " " " "*"
" "
8 ( 1 ) "*"
"*" " "
" " " " "*"
" "
9 ( 1 ) "*"
"*" " "
" " " " "*"
" "
10 ( 1 ) "*"
"*" " "
" " " " "*"
" "
11 ( 1 ) "*"
"*" " "
" " " " "*"
" "
12 ( 1 ) "*"
"*" " "
"*" " " "*"
" "
13 ( 1 ) "*"
"*" " "
"*" " " "*"
" "
14 ( 1 ) "*"
"*" "*"
"*" " " "*"
" "
15 ( 1 ) "*"
"*" "*"
"*" " " "*"
" "
16 ( 1 ) "*"
"*" "*"
"*" "*" "*"
" "
17 ( 1 ) "*"
"*" "*"
"*" "*" "*"
" "
18 ( 1 ) "*"
"*" "*"
"*" "*" "*"
"*"
19 ( 1 ) "*"
"*" "*"
"*" "*" "*"
"*"
CRBI CWalks LeagueN DivisionW PutOuts
1 ( 1 ) " " " "
" "
" "
" "
2 ( 1 ) " " " "
" "
" "
" "
3 ( 1 ) " " " "
" "
" "
"*"
4 ( 1 ) " " " "
" "
" "
"*"
5 ( 1 ) " " " "
" "
" "
"*"
6 ( 1 ) " " " "
" "
"*"
"*"
7 ( 1 ) " " "*"
" "
"*"
"*"
8 ( 1 ) "*" "*"
" "
"*"
"*"
9 ( 1 ) "*" "*"
" "
"*"
"*"
10 ( 1 ) "*" "*"
" "
"*"
"*"
11 ( 1 ) "*" "*"
"*"
"*"
"*"
12 ( 1 ) "*" "*"
"*"
"*"
"*"
13 ( 1 ) "*" "*"
"*"
"*"
"*"
14 ( 1 ) "*" "*"
"*"
"*"
"*"
15 ( 1 ) "*" "*"
"*"
"*"
"*"
16 ( 1 ) "*" "*"
"*"
"*"
"*"
17 ( 1 ) "*" "*"
"*"
"*"
"*"
18 ( 1 ) "*" "*"
"*"
"*"
"*"
19 ( 1 ) "*" "*"
"*"
"*"
"*"
12
150
150
140
140
140
140
140
140
130
130
130
120
120
120
110
110
100
96
89
(Intercept)
AtBat
Hits
HmRun
Runs
RBI
Walks
Years
CAtBat
CHits
CHmRun
CRuns
CRBI
CWalks
LeagueN
DivisionW
PutOuts
Assists
Errors
NewLeagueN
bic
plot(regfit.bwd,scale="bic")
13
which.min(val.errors)
## [1] 10
coef(regfit.best,10)
## (Intercept)       AtBat        Hits       Walks      CAtBat       CHits
## -80.2751499  -1.4683816   7.1625314   3.6430345  -0.1855698   1.1053238
##      CHmRun      CWalks     LeagueN   DivisionW     PutOuts
##   1.3844863  -0.7483170  84.5576103 -53.0289658   0.2381662
##  (Intercept)        AtBat         Hits        Walks       CAtBat
##  162.5354420   -2.1686501    6.9180175    5.7732246   -0.1300798
##        CRuns         CRBI       CWalks    DivisionW      PutOuts
##    1.4082490    0.7743122   -0.8308264 -112.3800575    0.2973726
##      Assists
##    0.2831680
Cross-Validation
# predict() method for objects of class "regsubsets".
#
# Arguments:
#   object  - fitted object; the second element of its stored call is used
#             as the model formula.
#   newdata - data frame to build the design matrix from.
#   id      - passed to coef() to choose which model's coefficients to use.
#   ...     - accepted for compatibility with the predict() generic; unused.
#
# Returns the matrix product of the selected design-matrix columns and the
# coefficient vector (one row per row of the design matrix).
predict.regsubsets <- function(object, newdata, id, ...) {
  # Recover the formula from the call that created the fit.
  model.formula <- as.formula(object$call[[2]])
  design <- model.matrix(model.formula, newdata)
  beta <- coef(object, id = id)
  # Keep only the columns that have a coefficient in the chosen model.
  design[, names(beta)] %*% beta
}
# k-fold cross-validation of the subset-selection fits, for model sizes 1..19.
k <- 10
set.seed(1)  # fix the random fold assignment so the run is repeatable
# Draw a fold label in 1..k for every row of Hitters (sampling with
# replacement, so fold sizes are not forced to be equal).
folds <- sample(seq_len(k), nrow(Hitters), replace = TRUE)
# cv.errors[j, i] holds the mean squared error on fold j of the i-variable model.
cv.errors <- matrix(
  NA, k, 19,
  dimnames = list(paste(seq_len(k)), paste(seq_len(19)))
)
for (j in seq_len(k)) {
  # Fit on every row that is not in fold j ...
  best.fit <- regsubsets(Salary ~ ., data = Hitters[folds != j, ], nvmax = 19)
  for (i in seq_len(19)) {
    # ... then score the i-variable model on the rows of fold j.
    pred <- predict(best.fit, Hitters[folds == j, ], id = i)
    cv.errors[j, i] <- mean((Hitters$Salary[folds == j] - pred)^2)
  }
}
# Average over the k folds to get one error per model size.
mean.cv.errors <- apply(cv.errors, 2, mean)
mean.cv.errors
##        1        2        3        4        5        6        7        8
## 160093.5 140196.8 153117.0 151159.3 146841.3 138302.6 144346.2 130207.7
##        9       10       11       12       13       14       15       16
## 129459.6 125334.7 125153.8 128273.5 133461.0 133974.6 131825.7 131882.8
##       17       18       19
## 132750.9 133096.2 132804.7
14
mean.cv.errors
plot(mean.cv.errors,type="b")
10
15
Index
reg.best=regsubsets(Salary~.,data=Hitters,nvmax =19)
coef(reg.best ,11)
##  (Intercept)        AtBat         Hits        Walks       CAtBat
##  135.7512195   -2.1277482    6.9236994    5.6202755   -0.1389914
##        CRuns         CRBI       CWalks      LeagueN    DivisionW
##    1.4553310    0.7852528   -0.8228559   43.1116152 -111.1460252
##      PutOuts      Assists
##    0.2894087    0.2688277
Reference:
James, Gareth, et al. An Introduction to Statistical Learning. New
York: Springer, 2013.
15