########################################################## # # STATS 760: R code for lecture 3 # ########################################################## # load libraries library(nnet) library(rpart) # California data # read in data URL = "https://www.stat.auckland.ac.nz/~lee/760/houses.data.txt" houses.df = read.table(URL, header =FALSE,colClasses="numeric") names(houses.df)=c("medval","medinc", "medage","rooms","bedrooms","pop", "house","lat","long") # standardize data, using function above (after transforming response to log) standard.df = houses.df standard.df[,1]=log(standard.df[,1]) standard.df = data.frame(scale(standard.df)) # now loop over 50 random splits: 15000 training, the rest test nsplit = 50 # make array to hold results result = array(0, c(6, 2, nsplit)) for(i in 1:nsplit){ use = sample(dim(standard.df)[1],15000) training = standard.df[use,] test=standard.df[-use,] # lm model.lm.1 = lm(medval~., data=training) result[1,1,i] = mean(residuals(model.lm.1)^2) result[1,2,i]= mean((test$medval - predict(model.lm.1, newdata=test))^2) # nnet model.nn.1 = nnet(medval~medinc+medage+rooms+bedrooms+pop+house+lat+long, data=training, size=1, linout=TRUE,trace=FALSE) result[2,1,i] = mean(residuals(model.nn.1)^2) result[2,2,i]= mean((test$medval - predict(model.nn.1, newdata=test))^2) model.nn.2 = nnet(medval~medinc+medage+rooms+bedrooms+pop+house+lat+long, data=training, size=2,linout=TRUE,trace=FALSE, maxit=200) result[3,1,i] = mean(residuals(model.nn.2)^2) result[3,2,i]= mean((test$medval - predict(model.nn.2, newdata=test))^2) model.nn.4 = nnet(medval~medinc+medage+rooms+bedrooms+pop+house+lat+long, data=training, size=4, linout=TRUE, maxit=500,trace=FALSE) result[4,1,i] = mean(residuals(model.nn.4)^2) result[4,2,i]= mean((test$medval - predict(model.nn.4, newdata=test))^2) # tree in library(rpart) model.rp.1 = rpart(medval~medinc+medage+rooms+bedrooms+pop+house+lat+long, data=training, method="anova", cp=0.01) result[5,1,i] = mean(residuals(model.rp.1)^2) result[5,2,i]= mean((test$medval - predict(model.rp.1, newdata=test))^2) model.rp.2 = rpart(medval~medinc+medage+rooms+bedrooms+pop+house+lat+long, data=training, method="anova", cp = 0.001) result[6,1,i] = mean(residuals(model.rp.2)^2) result[6,2,i]= mean((test$medval - predict(model.rp.2, newdata=test))^2) print(i) } mean.result = apply(result, 1:2, mean) dimnames(mean.result) = list(c("lm", "nnet1", "nnet2","nnet4", "rpart01", "rpart001"), c("In=sample", "out-of-sample")) round(mean.result,3) ########################################################### # package as a function compare.methods = function(formula, data, response.column, N){ result = array(0,c(3,2,N)) dimnames(result) =list(c("Linear","Tree","Nnet"), c("training","test")) data1=data n = dim(data1)[1] n0=floor(0.75*n) for( i in 1:N){ ranperm = sample(n,n) randData=data[ranperm,] use.fit = (1:n0) use.pred = ((n0+1):n) y0 = randData[use.pred,response.column] X0 =randData[use.pred,-response.column] model1 = lm(formula, subset = use.fit, data=randData) result[1,1,i] = mean(residuals(model1)^2) result[1,2,i] = mean((y0 - predict(model1,newdata=X0))^2) mod.tree = rpart(formula, data=randData, subset=use.fit, method="anova", cp=0.0001, minsplit = 5) result[2,1,i] = mean(residuals(mod.tree)^2) result[2,2,i] = mean((y0 - predict(mod.tree,newdata=X0))^2) mod.nnet = nnet(formula, data=randData, subset=use.fit, size=5,linout=TRUE, trace=FALSE) result[3,1,i] = mean(residuals(mod.nnet)^2) result[3,2,i] = mean((y0 - predict(mod.nnet,newdata=X0))^2) } list(mean=apply(result,c(1,2),mean)) } ##################################################### # apply to the california data formula = medval~medinc+medage+rooms+bedrooms+pop+house+lat+long califResult =compare.methods(formula, standard.df, 1, 50) > califResult $mean training test Linear 0.3562816 0.3591457 Tree 0.3698267 0.3810092 nnet 0.2555022 0.2597914 ##################################################### # and the CPU data library(MASS) data(cpus) cpu = cpus[,-1] cpu[,7]=log(cpu[,7]) cpuScaled = data.frame(scale(cpu)) formula = perf ~ syct + mmin + mmax + cach + chmin + chmax cpuResult =compare.methods(formula, cpuScaled, 7, 50) > cpuResult $mean training test Linear 0.18229167 0.2127489 Tree 0.03498101 0.2092605 Nnet 0.08512439 0.1918876 ########################################################## # bootstrap simulation slide 18 mytable = matrix(0, 3,6) nrow=1 par(mfrow=c(1,3)) for(n in c(20,50,100)){ nSim = 100 outmat=matrix(0,nSim,5) # Compute Ebeta1 etc nSim1 = 10000 betas = matrix(0,nSim1,2) for(i in 1:nSim1){ X = data.frame(mvrnorm(n, mu=rep(0,2),Sigma=Sigma)) colnames(X) = c("x","y") betas[i,] = coef(lm(y~x, data=X)) } mytable[nrow,6] = 1-2*rho*mean(betas[,2]) + sum (apply(betas^2,2,mean)) for(i in 1:nSim){ X = data.frame(mvrnorm(n, mu=rep(0,2),Sigma=Sigma)) colnames(X) = c("x","y") fit = lm(y~x, data=X) err = mean((X$y-predict(fit))^2) x = X[,1] y = X[,2] theta.fit <- function(x,y){lsfit(x,y)} theta.predict <- function(fit,x){cbind(1,x)%*%fit$coef} sq.err <- function(y,yhat) { (y-yhat)^2} results <- bootpred(x,y,nboot=200,theta.fit,theta.predict, err.meas=sq.err) outmat[i,1:3] = c(results[[1]], results[[1]]+results[[2]], results[[3]]) stuff = crossval(x,y,theta.fit,theta.predict, ngroup=5) outmat[i,4]=mean((y-stuff$cv.fit)^2) stuff = crossval(x,y,theta.fit,theta.predict, ngroup=10) outmat[i,5]=mean((y-stuff$cv.fit)^2) } mytable[nrow,1:5]=apply(outmat,2,mean) # boxplot results xx = as.vector(outmat) g = rep(1:5, rep(nSim,5)) boxplot(xx~g, names = c("err", "err+opt", ".632","CV5", "CV10"), ylab = "Estimated prediction error", ylim = c(0,2), main = paste("Boxplots of estimated PE for n = ",n,", with true PE indicated in blue", sep="")) abline(h=mytable[nrow,6], lwd=2, col="blue") nrow=nrow+1 } dimnames(mytable)=list(paste("n =",c(20,50,100)), c("err", "err+opt", ".632","CV5", "CV10","True PE")) > round(mytable,3) err err+opt .632 CV5 CV10 True PE n = 20 0.675 0.813 0.829 0.860 0.844 0.834 n = 50 0.698 0.754 0.755 0.764 0.761 0.781 n = 100 0.742 0.771 0.771 0.774 0.773 0.765 ####################################################### # ridge and lasso ###################################################### # some functions my.lasso.plot=function(x, y, alpha, family="gaussian"){ par(mfrow=c(1,2)) fit=glmnet(x,y,family=family, alpha=alpha,nlambda=50) fit2 = cv.glmnet(x,y, alpha=0, lambda=fit$lambda) plot(fit2) index = order(abs(fit$lambda-fit2$lambda.1se))[1] smax = if(alpha==0)sum(coef(lm(y~x))^2) else if(alpha==1)sum(abs(coef(lm(y~x)))) else max(sum(coef(lm(y~x))^2), sum(abs(coef(lm(y~x))))) betamat = as.matrix(fit$beta) plot(c(0,1.2), range(betamat), type="n", axes=FALSE, xlab = "scaled s", ylab = "coeficients") box() axis(1, at = seq(0,1,by=0.2)) axis(2, at = pretty(range(betamat) )) p = dim(betamat)[1] my.col = rainbow(p) s = (1:dim(betamat)[2])/dim(betamat)[2] for(i in 1:p){ lines(s, betamat[i,], col = my.col[i] ) points(s, betamat[i,], pch=19,col = my.col[i]) text(1, betamat[i,dim(betamat)[2]], colnames(x)[i], pos=4, cex=0.7) } abline(v = s[index], lty=2) } ################################################### # Example: CPU data library(MASS) library(glmnet) data(cpus) cpus = scale(cpus[,-c(1,9)]) X = cpus[,-7] # column 7 is the response y = cpus[,7] # Set alpha=0 for ridge plot my.lasso.plot(X, y, alpha=0, family="gaussian") # Set alpha=1 for lasso plot my.lasso.plot(X, y, alpha=0, family="gaussian") ############################################# # Boosting with trees library(mboost) data(bodyfat, package="TH.data") ############################################# # Boosting with regression as base learner library(mboost) library(TH.data) data(bodyfat, package="TH.data") # note that linear regression is the default base learner bf_glm <- glmboost(DEXfat ~ ., data = bodyfat, control= boost_control (mstop = 100, nu = 0.1,center = TRUE)) plot(cvrisk(bf_glm)) ############################################# # Boosting with trees as base learner library(party) bf_tree <- blackboost(DEXfat ~ ., data = bodyfat, control= boost_control (mstop = 100, nu = 0.1,center = TRUE), tree_controls = ctree_control(maxdepth = 3)) plot(cvrisk(bf_tree)) ############################################# # Random Forest library(randomForest) myfit.rf=randomForest(DEXfat ~ ., data=bodyfat, ntree=100, mtry=2, maxnodes = 4, importance=TRUE) plot(myfit.rf) ########################################## # california data URL = "https://www.stat.auckland.ac.nz/~lee/760/houses.data.txt" houses.df = read.table(URL, header =FALSE,colClasses="numeric") names(houses.df)=c("medval","medinc", "medage","rooms","bedrooms","pop", "house","lat","long") # standardize data, using function above (after transforming response to log) standard.df = houses.df standard.df[,1]=log(standard.df[,1]) standard.df = data.frame(scale(standard.df)) Do a random split: 15000 training, the rest test use = sample(dim(standard.df)[1],15000) training = standard.df[use,] test=standard.df[-use,] calif.rf=randomForest(medval ~ ., data=training, ntree=200, mtry=5, importance=TRUE) plot(calif.rf) mean((training$medval-predict(calif.rf))^2) mean((test$medval-predict(calif.rf, newdata=test))^2) ############################################### # use of caret library(caret) library(MASS) data(cpus) library(rpart) my.grid <- expand.grid(.decay = c(0.001), .size = 2:10) cpu = cpus[,-1] cpu[,7]=log(cpu[,7]) cpuScaled = data.frame(scale(cpu)) formula = perf ~ syct + mmin + mmax + cach + chmin + chmax nnCV <- train(formula, data = cpuScaled, method = "nnet", maxit = 1000, tuneGrid = my.grid, trace = FALSE, linout = TRUE, trControl = trainControl(method="cv", number=5, repeats=1000))