Introduction to Statistical Learning ISLR Chapter 8 Solutions Code, Exercises of Statistics

Tree-Based Methods - Exercise R code as solution manual for ISLR (An Introduction to Statistical Learning) by James, Witten, Hastie, Tibshirani

Typology: Exercises

2020/2021

Uploaded on 05/26/2021

ekaksha
ekaksha 🇺🇸

4.4

(30)

268 documents

1 / 4

Toggle sidebar

This page cannot be seen from the preview

Don't miss anything!

bg1
# ---- Tree-based methods: single regression trees ----
library(ISLR)
library(tree)

# Regression tree with a single predictor: Salary explained by Years.
data(Hitters)
tree.mod <- tree(Salary ~ Years, data = Hitters)
plot(tree.mod)
text(tree.mod)

# Unpruned tree for the Hitters data: Salary against all other columns.
tree.unpr <- tree(Salary ~ ., data = Hitters)
plot(tree.unpr)
text(tree.unpr, pretty = 0)
# ---- Estimate the test error with a training / test split ----
set.seed(100)

# Drop rows containing missing values before splitting.
Hitters <- na.omit(Hitters)

# Draw half of the row indices at random for training; the negated index
# vector selects the remaining rows as the test set.
train <- sample(seq_len(nrow(Hitters)), nrow(Hitters) / 2)
test <- -train
data.train <- Hitters[train, ]
data.test <- Hitters[test, ]

# Fit on the training rows only, then score the held-out rows.
tree.mod <- tree(Salary ~ ., data = Hitters, subset = train)
yhat <- predict(tree.mod, newdata = data.test)

# Test mean squared error.
test.error <- mean((yhat - data.test$Salary)^2)
test.error
# ---- Pruning ----
# Cross-validate the fitted tree and plot deviance against tree size.
cv.hitters <- cv.tree(tree.mod)
plot(cv.hitters$size, cv.hitters$dev, type = "b")

# Prune back to the subtree requested with best = 3 and draw it.
prune.hitters <- prune.tree(tree.mod, best = 3)
plot(prune.hitters)
text(prune.hitters, pretty = 0)

# Test mean squared error of the pruned tree.
yhat <- predict(prune.hitters, newdata = data.test)
mean((yhat - data.test$Salary)^2)
##### Bagging & Random Forest ####
library(randomForest)
set.seed(100)

## Bagging ##
# Bagging is fitted here as a random forest whose mtry equals the number of
# predictors, i.e. every column of Hitters except the response Salary
# (the original hard-coded this as 19).
bag.hitters <- randomForest(Salary ~ ., data = Hitters, subset = train,
                            mtry = ncol(Hitters) - 1, importance = TRUE)
bag.hitters

# Predictions for the held-out rows.
yhat.bag <- predict(bag.hitters, newdata = Hitters[-train, ])

# Predicted vs. observed salaries on the test rows, with the y = x reference
# line. The response column is `Salary` (capital S): `$salary` does not match
# any column and yields NULL, which silently turns the plot into predictions
# against their index.
plot(yhat.bag, Hitters[-train, ]$Salary)
abline(0, 1)

# Test mean squared error.
mean((yhat.bag - Hitters[-train, ]$Salary)^2)

# the importance plot #
varImpPlot(bag.hitters)
## Random Forest ##
pf3
pf4

Partial preview of the text

Download Introduction to Statistical Learning ISLR Chapter 8 Solutions Code and more Exercises Statistics in PDF only on Docsity!

###### Tree ####
library(ISLR)
library(tree)

### Regression Tree ###
# Single-predictor tree: Salary explained by Years.
data(Hitters)
tree.mod <- tree(Salary ~ Years, data = Hitters)
plot(tree.mod)
text(tree.mod)

### Unpruned tree for Hitters data ####
# Salary against all other columns.
tree.unpr <- tree(Salary ~ ., data = Hitters)
plot(tree.unpr)
text(tree.unpr, pretty = 0)

### Estimate the test error with training and test sets ###
set.seed(100)

# Drop rows containing missing values before splitting.
Hitters <- na.omit(Hitters)

# Half of the row indices are drawn at random for training; the negated
# index vector selects the remaining rows as the test set.
train <- sample(seq_len(nrow(Hitters)), nrow(Hitters) / 2)
test <- -train
data.train <- Hitters[train, ]
data.test <- Hitters[test, ]

# Fit on the training rows only and score the held-out rows.
tree.mod <- tree(Salary ~ ., data = Hitters, subset = train)
yhat <- predict(tree.mod, data.test)

# Test mean squared error.
test.error <- mean((yhat - data.test$Salary)^2)
test.error

### Pruning ####
# Cross-validate the fitted tree and plot deviance against tree size.
cv.hitters <- cv.tree(tree.mod)
plot(cv.hitters$size, cv.hitters$dev, type = "b")

# Prune back to the subtree requested with best = 3 and draw it.
prune.hitters <- prune.tree(tree.mod, best = 3)
plot(prune.hitters)
text(prune.hitters, pretty = 0)

# Test mean squared error of the pruned tree.
yhat <- predict(prune.hitters, data.test)
mean((yhat - data.test$Salary)^2)

##### Bagging & Random Forest ####
library(randomForest)
set.seed(100)

## Bagging ##
# Bagging is fitted as a random forest with mtry = 19, the value the listing
# uses so that all predictors are candidates at every split.
bag.hitters <- randomForest(Salary ~ ., data = Hitters, subset = train,
                            mtry = 19, importance = TRUE)
bag.hitters

# Predictions for the held-out rows.
yhat.bag <- predict(bag.hitters, newdata = Hitters[-train, ])

# Predicted vs. observed test salaries with the y = x reference line.
# The column is `Salary` (capital S); `$salary` yields NULL and would plot
# the predictions against their index instead.
plot(yhat.bag, Hitters[-train, ]$Salary)
abline(0, 1)

# Test mean squared error.
mean((yhat.bag - Hitters[-train, ]$Salary)^2)

# the importance plot #
varImpPlot(bag.hitters)

## Random Forest ##
# Same call as bagging, but with a smaller mtry so only a subset of the
# predictors is tried at each split.
# NOTE(review): sqrt(19) is not an integer; confirm how randomForest rounds it.
rf.hitters <- randomForest(Salary ~ ., data = Hitters, subset = train,
                           mtry = sqrt(19), importance = TRUE)
yhat.rf <- predict(rf.hitters, newdata = Hitters[-train, ])

# Test mean squared error of the random forest.
mean((yhat.rf - Hitters[-train, ]$Salary)^2)
varImpPlot(rf.hitters)
importance(rf.hitters)

######## Boosting #######
library(gbm)
set.seed(100)

# Boosted regression trees with squared-error loss on the training rows.
# The argument is spelled out as `n.trees` rather than relying on partial
# matching of `n.tree`.
boost.hitters <- gbm(Salary ~ ., data = Hitters[train, ],
                     distribution = "gaussian", n.trees = 5000,
                     interaction.depth = 4)
summary(boost.hitters)

# Predict the held-out rows using all 5000 trees, then compute the test MSE.
yhat.boost <- predict(boost.hitters, newdata = Hitters[-train, ],
                      n.trees = 5000)
mean((yhat.boost - Hitters[-train, ]$Salary)^2)

# Different values of lambda can be used (gbm's `shrinkage` argument).
# The original note gives the default as lambda = 0.001.
# NOTE(review): recent gbm releases document shrinkage = 0.1 as the default
# of gbm() -- confirm against the installed version.

##############################################################################
################# Tree (Checkerboard Data) ##############
##############################################################################

#############################
# Data Simulation
#############################
library(MASS)
set.seed(100)

# Each class has two cluster centres placed on opposite corners of a 2 x 2
# grid: class 0 on one diagonal, class 1 on the other.
mean.class0.1 <- c(2.5, 2.5)
mean.class0.2 <- c(7.5, 7.5)
mean.class1.1 <- c(2.5, 7.5)
mean.class1.2 <- c(7.5, 2.5)

# Covariance matrix shared by all four clusters.
sigma.cov <- matrix(c(1, 0.2, 0.2, 4), nrow = 2, byrow = TRUE)

# 20 bivariate normal draws per cluster.
data.class0.1 <- mvrnorm(20, mean.class0.1, sigma.cov)
data.class0.2 <- mvrnorm(20, mean.class0.2, sigma.cov)
data.class1.1 <- mvrnorm(20, mean.class1.1, sigma.cov)
data.class1.2 <- mvrnorm(20, mean.class1.2, sigma.cov)
#data.new <- mvrnorm(4,mean.class0,sigma.cov)

#############################
# Partition the x space
#############################
# The axis limits x.min / x.max / y.min / y.max are used below but are not
# defined anywhere earlier in this listing. When they do not already exist
# they are taken from the range of the simulated points, so the plot call
# does not fail with "object not found".
all.points <- rbind(data.class0.1, data.class0.2,
                    data.class1.1, data.class1.2)
if (!exists("x.min")) x.min <- min(all.points[, 1])
if (!exists("x.max")) x.max <- max(all.points[, 1])
if (!exists("y.min")) y.min <- min(all.points[, 2])
if (!exists("y.max")) y.max <- max(all.points[, 2])

# Empty canvas with a 0.5 margin around the data.
plot(0, 0,
     xlim = c(x.min - 0.5, x.max + 0.5),
     ylim = c(y.min - 0.5, y.max + 0.5),
     type = "n", xlab = "x1", ylab = "x2")

# Class 0 clusters are drawn as "o", class 1 clusters as "+".
points(data.class0.1[, 1], data.class0.1[, 2], pch = "o")
points(data.class0.2[, 1], data.class0.2[, 2], pch = "o")
points(data.class1.1[, 1], data.class1.1[, 2], pch = "+")
points(data.class1.2[, 1], data.class1.2[, 2], pch = "+")
#points(data.new[,1],data.new[,2],pch="?")

# Partition boundaries with hard-coded cut points.
# NOTE(review): presumably the split values of a tree fitted to this data --
# confirm where 8.296, 4.402, 4.643 and 4.531 come from.
abline(v = 8.296)
lines(c(0, 8.296), c(4.402, 4.402))
lines(c(4.643, 4.643), c(-2, 4.402))
lines(c(4.531, 4.531), c(4.402, 14))