##########################################################
#
#   STATS 760: R code for lecture 3
#
##########################################################

# load libraries

library(nnet)
library(rpart)


#  California data

# read in data

URL = "https://www.stat.auckland.ac.nz/~lee/760/houses.data.txt"
houses.df = read.table(URL, header =FALSE,colClasses="numeric")
names(houses.df)=c("medval","medinc", "medage","rooms","bedrooms","pop", "house","lat","long")

# standardize data, using function above (after transforming response to log)

standard.df = houses.df
standard.df[,1]=log(standard.df[,1])
standard.df = data.frame(scale(standard.df))

# now loop over 50 random splits: 15000 training, the rest test

nsplit = 50

# make array to hold results
result = array(0, c(6, 2, nsplit))

for(i in 1:nsplit){
use = sample(dim(standard.df)[1],15000)
training = standard.df[use,]
test=standard.df[-use,]


# lm
model.lm.1 = lm(medval~., data=training)
result[1,1,i] = mean(residuals(model.lm.1)^2)
result[1,2,i]= mean((test$medval - predict(model.lm.1, newdata=test))^2)

# nnet 
model.nn.1 = nnet(medval~medinc+medage+rooms+bedrooms+pop+house+lat+long, data=training, size=1, linout=TRUE,trace=FALSE)
result[2,1,i] = mean(residuals(model.nn.1)^2)
result[2,2,i]= mean((test$medval - predict(model.nn.1, newdata=test))^2)

model.nn.2 = nnet(medval~medinc+medage+rooms+bedrooms+pop+house+lat+long, data=training, size=2,linout=TRUE,trace=FALSE, maxit=200)
result[3,1,i] = mean(residuals(model.nn.2)^2)
result[3,2,i]= mean((test$medval - predict(model.nn.2, newdata=test))^2)

model.nn.4 = nnet(medval~medinc+medage+rooms+bedrooms+pop+house+lat+long, data=training, size=4, linout=TRUE, maxit=500,trace=FALSE)
result[4,1,i] = mean(residuals(model.nn.4)^2)
result[4,2,i]= mean((test$medval - predict(model.nn.4, newdata=test))^2)

# tree in library(rpart)

model.rp.1 = rpart(medval~medinc+medage+rooms+bedrooms+pop+house+lat+long, data=training, method="anova", cp=0.01)
result[5,1,i] = mean(residuals(model.rp.1)^2)
result[5,2,i]= mean((test$medval - predict(model.rp.1, newdata=test))^2)

model.rp.2 = rpart(medval~medinc+medage+rooms+bedrooms+pop+house+lat+long, data=training, method="anova", cp = 0.001)
result[6,1,i] = mean(residuals(model.rp.2)^2)
result[6,2,i]= mean((test$medval - predict(model.rp.2, newdata=test))^2)
print(i)
}

mean.result = apply(result, 1:2, mean)
dimnames(mean.result) = list(c("lm", "nnet1", "nnet2","nnet4", "rpart01", "rpart001"), c("In=sample", "out-of-sample"))
round(mean.result,3)


###########################################################
# package as a function
compare.methods = function(formula, data, response.column, N){
result = array(0,c(3,2,N))
dimnames(result) =list(c("Linear","Tree","Nnet"), c("training","test"))
data1=data

n = dim(data1)[1]
n0=floor(0.75*n)
for( i in 1:N){
ranperm = sample(n,n)
randData=data[ranperm,]
use.fit = (1:n0)
use.pred = ((n0+1):n)
y0 = randData[use.pred,response.column]
X0 =randData[use.pred,-response.column]


model1  = lm(formula, subset = use.fit, data=randData)
result[1,1,i] = mean(residuals(model1)^2)
result[1,2,i] = mean((y0 - predict(model1,newdata=X0))^2) 


mod.tree = rpart(formula, data=randData, subset=use.fit, method="anova", cp=0.0001, minsplit = 5)
result[2,1,i] = mean(residuals(mod.tree)^2)
result[2,2,i] = mean((y0 - predict(mod.tree,newdata=X0))^2)


mod.nnet = nnet(formula, data=randData, subset=use.fit, size=5,linout=TRUE, trace=FALSE)
result[3,1,i] = mean(residuals(mod.nnet)^2)
result[3,2,i] = mean((y0 - predict(mod.nnet,newdata=X0))^2)

}
list(mean=apply(result,c(1,2),mean))

}

#####################################################
# apply to the california data
formula = medval~medinc+medage+rooms+bedrooms+pop+house+lat+long
califResult =compare.methods(formula, standard.df, 1, 50)
> califResult
$mean
        training      test
Linear 0.3562816 0.3591457
Tree   0.3698267 0.3810092
nnet   0.2555022 0.2597914


#####################################################
# and the CPU data
library(MASS)
data(cpus)
cpu = cpus[,-1]
cpu[,7]=log(cpu[,7])
cpuScaled = data.frame(scale(cpu))

formula =  perf ~ syct + mmin + mmax + cach + chmin + chmax
cpuResult =compare.methods(formula, cpuScaled, 7, 50)

> cpuResult
$mean
         training      test
Linear 0.18229167 0.2127489
Tree   0.03498101 0.2092605
Nnet   0.08512439 0.1918876

##########################################################
# bootstrap simulation slide 18

mytable = matrix(0, 3,6)
nrow=1
par(mfrow=c(1,3))
for(n in c(20,50,100)){
nSim = 100
outmat=matrix(0,nSim,5)



# Compute   Ebeta1 etc
nSim1 = 10000
betas = matrix(0,nSim1,2)
for(i in 1:nSim1){
X = data.frame(mvrnorm(n, mu=rep(0,2),Sigma=Sigma))
colnames(X) = c("x","y")
betas[i,] = coef(lm(y~x, data=X))
}

mytable[nrow,6] = 1-2*rho*mean(betas[,2]) + sum (apply(betas^2,2,mean))


for(i in 1:nSim){
X = data.frame(mvrnorm(n, mu=rep(0,2),Sigma=Sigma))
colnames(X) = c("x","y")
fit = lm(y~x, data=X)
err =  mean((X$y-predict(fit))^2)
x = X[,1]
y = X[,2]
theta.fit <- function(x,y){lsfit(x,y)}
theta.predict <- function(fit,x){cbind(1,x)%*%fit$coef}
sq.err <- function(y,yhat) { (y-yhat)^2}
results <- bootpred(x,y,nboot=200,theta.fit,theta.predict,
    err.meas=sq.err)
outmat[i,1:3] = c(results[[1]], results[[1]]+results[[2]], results[[3]])
stuff = crossval(x,y,theta.fit,theta.predict, ngroup=5)
outmat[i,4]=mean((y-stuff$cv.fit)^2)
stuff = crossval(x,y,theta.fit,theta.predict, ngroup=10)
outmat[i,5]=mean((y-stuff$cv.fit)^2)

}

mytable[nrow,1:5]=apply(outmat,2,mean)


# boxplot results

xx = as.vector(outmat)
g = rep(1:5, rep(nSim,5))
boxplot(xx~g, names = c("err", "err+opt", ".632","CV5", "CV10"),
    ylab = "Estimated prediction error",
    ylim = c(0,2), 
    main = paste("Boxplots of estimated PE for n = ",n,",
         with true PE indicated in blue", sep=""))
abline(h=mytable[nrow,6], lwd=2, col="blue")
nrow=nrow+1
}

dimnames(mytable)=list(paste("n =",c(20,50,100)),
   c("err", "err+opt", ".632","CV5", "CV10","True PE"))

 > round(mytable,3)
          err err+opt  .632   CV5  CV10 True PE
n = 20  0.675   0.813 0.829 0.860 0.844   0.834
n = 50  0.698   0.754 0.755 0.764 0.761   0.781
n = 100 0.742   0.771 0.771 0.774 0.773   0.765


#######################################################
# ridge and lasso

######################################################
# some functions

my.lasso.plot=function(x, y, alpha, family="gaussian"){
par(mfrow=c(1,2))

fit=glmnet(x,y,family=family, alpha=alpha,nlambda=50)
fit2 = cv.glmnet(x,y, alpha=0, lambda=fit$lambda)
plot(fit2)
index = order(abs(fit$lambda-fit2$lambda.1se))[1]
smax = if(alpha==0)sum(coef(lm(y~x))^2) else 
        if(alpha==1)sum(abs(coef(lm(y~x)))) else max(sum(coef(lm(y~x))^2), sum(abs(coef(lm(y~x)))))
betamat = as.matrix(fit$beta)
plot(c(0,1.2), range(betamat), type="n", axes=FALSE, xlab = "scaled s", ylab = "coeficients")
box()
axis(1, at = seq(0,1,by=0.2))
axis(2, at = pretty(range(betamat) ))
p = dim(betamat)[1]
my.col = rainbow(p)
s = (1:dim(betamat)[2])/dim(betamat)[2]
for(i in 1:p){
 lines(s, betamat[i,], col = my.col[i] )
 points(s, betamat[i,], pch=19,col = my.col[i])
 text(1, betamat[i,dim(betamat)[2]], colnames(x)[i], pos=4, cex=0.7)
}
abline(v = s[index], lty=2)
}





###################################################

# Example: CPU data

library(MASS)
library(glmnet)
data(cpus)
cpus = scale(cpus[,-c(1,9)])

X = cpus[,-7] # column 7 is the response
y = cpus[,7] 
# Set alpha=0 for ridge plot
my.lasso.plot(X, y, alpha=0, family="gaussian")

# Set alpha=1 for lasso plot
my.lasso.plot(X, y, alpha=0, family="gaussian")

#############################################
# Boosting with trees

library(mboost)
data(bodyfat, package="TH.data")
#############################################
# Boosting with regression as base learner

library(mboost)
library(TH.data)
data(bodyfat, package="TH.data")
# note that linear regression is the default base learner
bf_glm <- glmboost(DEXfat ~ ., data = bodyfat,
     control= boost_control (mstop = 100, nu = 0.1,center = TRUE))
plot(cvrisk(bf_glm))


#############################################
# Boosting with trees as base learner
library(party)
bf_tree <- blackboost(DEXfat ~ ., data = bodyfat,
     control= boost_control (mstop = 100, nu = 0.1,center = TRUE),
     tree_controls = ctree_control(maxdepth = 3))
plot(cvrisk(bf_tree))

#############################################
# Random Forest

library(randomForest)
myfit.rf=randomForest(DEXfat ~ ., data=bodyfat, ntree=100,
mtry=2, maxnodes = 4, importance=TRUE)
plot(myfit.rf)

##########################################
# california data

URL = "https://www.stat.auckland.ac.nz/~lee/760/houses.data.txt"
houses.df = read.table(URL, header =FALSE,colClasses="numeric")
names(houses.df)=c("medval","medinc", "medage","rooms","bedrooms","pop", "house","lat","long")

# standardize data, using function above (after transforming response to log)

standard.df = houses.df
standard.df[,1]=log(standard.df[,1])
standard.df = data.frame(scale(standard.df))

Do a random split: 15000 training, the rest test
use = sample(dim(standard.df)[1],15000)
training = standard.df[use,]
test=standard.df[-use,]

calif.rf=randomForest(medval ~ ., data=training, ntree=200,
mtry=5, importance=TRUE)
plot(calif.rf)
mean((training$medval-predict(calif.rf))^2)
mean((test$medval-predict(calif.rf, newdata=test))^2)

###############################################
# use of caret

library(caret)
library(MASS)
data(cpus)
library(rpart)

my.grid <- expand.grid(.decay = c(0.001), .size = 2:10)
cpu = cpus[,-1]
cpu[,7]=log(cpu[,7])
cpuScaled = data.frame(scale(cpu))
formula =  perf ~ syct + mmin + mmax + cach + chmin + chmax

nnCV <- train(formula, data = cpuScaled,
 method = "nnet", 
    maxit = 1000, 
    tuneGrid = my.grid, 
    trace = FALSE, 
    linout = TRUE,
    trControl = trainControl(method="cv", number=5, 
       repeats=1000))