#-*- R -*- # Chapter 6 Linear Statistical Models library(MASS) postscript(file="ch06.ps", width=8, height=6, pointsize=9) options(contrasts=c("contr.helmert", "contr.poly")) # 6.1 A linear regression example data(whiteside) if(require(lattice)) { ## can run this is lattice is available trellis.device(postscript, file="ch06b.ps", width=8, height=6, pointsize=9) print(xyplot(Gas ~ Temp | Insul, whiteside, panel = function(x, y, ...) { panel.xyplot(x, y, ...) panel.lmline(x, y, ...) }, xlab = "Average external temperature (deg. C)", ylab = "Gas consumption (1000 cubic feet)")) dev.off() } else { coplot(Gas ~ Temp | Insul, whiteside, panel = function(x, y, ...) { points(x, y, ...) abline(lm(y ~ x)) }, xlab = "Average external temperature (deg. C)", ylab = "Gas consumption (1000 cubic feet)") } gasB <- lm(Gas ~ Temp, whiteside, subset = Insul=="Before") gasA <- update(gasB, subset = Insul=="After") summary(gasB) summary(gasA) varB <- deviance(gasB)/gasB$df.resid # direct calculation varB <- summary(gasB)$sigma^2 # alternative gasBA <- lm(Gas ~ Insul/Temp - 1, whiteside) summary(gasBA) gasQ <- lm(Gas ~ Insul/(Temp + I(Temp^2)) - 1, whiteside) summary(gasQ)$coef gasPR <- lm(Gas ~ Insul + Temp, whiteside) anova(gasPR, gasBA) oldcon <- options(contrasts = c("contr.treatment", "contr.poly")) gasBA1 <- lm(Gas ~ Insul*Temp, whiteside) summary(gasBA1)$coef options(oldcon) # 6.2 Model formulae and model matrices dat <- data.frame(a = factor(rep(1:3, 3)), y = rnorm(9, rep(2:4, 3), 0.1)) obj <- lm(y ~ a, dat) alf.star <- coef(obj) alf.star Ca <- contrasts(dat$a) # contrast matrix for `a' alf <- drop(Ca %*% alf.star[-1]) alf dummy.coef(obj) N <- factor(Nlevs <- c(0,1,2,4)) contrasts(N) contrasts(ordered(N)) N2 <- N contrasts(N2, 2) <- poly(Nlevs, 2) N2 <- C(N, poly(Nlevs, 2), 2) # alternative contrasts(N2) fractions(ginv(contr.helmert(n = 4))) Cp <- diag(-1, 4, 5); Cp[row(Cp) == col(Cp) - 1] <- 1 Cp fractions(ginv(Cp)) # 6.3 Regression diagnostics data(hills) hills.lm <- lm(time ~ dist + climb, hills) hills.lm frame() par(fig=c(0, 0.6, 0, 0.55)) plot(fitted(hills.lm), studres(hills.lm)) abline(h=0, lty=2) # identify(fitted(hills.lm), studres(hills.lm), row.names(hills)) par(fig=c(0.6, 1, 0, 0.55), pty="s") qqnorm(studres(hills.lm)) qqline(studres(hills.lm)) par(pty="m") hills.hat <- lm.influence(hills.lm)$hat cbind(hills, lev=hills.hat)[hills.hat > 3/35, ] cbind(hills, pred=predict(hills.lm))["Knock Hill", ] hills1.lm <- lm(time ~ dist + climb, hills[-18, ]) hills1.lm lm(time ~ dist + climb, hills[-c(7,18), ]) summary(hills1.lm) summary(lm(time ~ dist + climb, hills[-18, ], weight=1/dist^2)) lm(time ~ -1 + dist + climb, hills[-18, ], weight=1/dist^2) #hills <- hills # make a local copy (needed in S-PLUS >= 5) hills$ispeed <- hills$time/hills$dist hills$grad <- hills$climb/hills$dist hills2.lm <- lm(ispeed ~ grad, hills[-18, ]) hills2.lm frame() par(fig=c(0, 0.6, 0, 0.55)) plot(hills$grad[-18], studres(hills2.lm), xlab="grad") abline(h=0, lty=2) # identify(hills$grad[-18], studres(hills2.lm), row.names(hills)[-18]) par(fig=c(0.6, 1, 0, 0.55), pty="s") qqnorm(studres(hills2.lm)) qqline(studres(hills2.lm)) par(pty="m") hills2.hat <- lm.influence(hills2.lm)$hat cbind(hills[-18,], lev=hills2.hat)[hills2.hat > 1.8*2/34, ] # 6.4 Safe prediction data(wtloss) quad1 <- lm(Weight ~ Days + I(Days^2), wtloss) quad2 <- lm(Weight ~ poly(Days, 2), wtloss) new.x <- data.frame(Days = seq(250, 300, 10), row.names=seq(250, 300, 10)) predict(quad1, newdata=new.x) predict(quad2, newdata=new.x) #predict.gam(quad2, newdata=new.x) # R does not have predict.gam new.white <- data.frame(Temp = 5*(0:2), Insul = factor(rep("After", 3))) predict(gasBA, new.white) # gives error in earlier S new.white <- data.frame(Temp = 5*(0:2), Insul = factor(rep("After", 3), levels=c("After", "Before"))) predict(gasBA, new.white) # wrong in earlier S # 6.5 Robust and resistant regression #haveLMR <- (version$major >= 5) | (version$major == 4 && version$minor >=5) library(lqs) data(phones) phones.lm <- lm(calls ~ year, phones) attach(phones); plot(year, calls); detach() abline(phones.lm$coef) abline(rlm(calls ~ year, phones, maxit=50), lty=2, col=2) abline(lqs(calls ~ year, phones), lty=3, col=3) #legend(locator(1), legend=c("least squares", "M-estimate", "LTS", lty=1:3, col=1:3) summary(lm(calls ~ year, data=phones), cor=F) summary(rlm(calls ~ year, maxit=50, data=phones), cor=F) summary(rlm(calls ~ year, scale.est="proposal 2", data=phones), cor=F) summary(rlm(calls ~ year, data=phones, psi=psi.bisquare), cor=F) lqs(calls ~ year, data=phones) lqs(calls ~ year, data=phones, method="lms") lqs(calls ~ year, data=phones, method="S") summary(rlm(calls ~ year, data=phones, method="MM"), cor=F) #if(haveLMR) { # phones.lmr <- lmRobMM(calls ~ year, data=phones) # print(summary(phones.lmr)) #} hills.lm hills1.lm # omitting Knock Hill rlm(time ~ dist + climb, hills) summary(rlm(time ~ dist + climb, hills, weights=1/dist^2, method="MM"), cor=F) lqs(time ~ dist + climb, data=hills, nsamp="exact") summary(hills2.lm) # omitting Knock Hill summary(rlm(ispeed ~ grad, hills), cor=F) summary(rlm(ispeed ~ grad, hills, method="MM"), cor=F) #if(haveLMR) # summary(lmRobMM(ispeed ~ grad, data=hills)) lqs(ispeed ~ grad, data=hills) # 6.6 Bootstrapping linear models library(boot) fit <- lm(calls ~ year, data=phones) ph <- data.frame(phones, res=resid(fit), fitted=fitted(fit)) ph.fun <- function(data, i) { d <- data d$calls <- d$fitted + d$res[i] coef(update(fit, data=d)) } ph.lm.boot <- boot(ph, ph.fun, R=499) ph.lm.boot fit <- rlm(calls ~ year, method="MM", data=phones) ph <- data.frame(phones, res=resid(fit), fitted=fitted(fit)) # next two lines too slow for testing S-PLUS #ph.rlm.boot <- boot(ph, ph.fun, R=499) #ph.rlm.boot # 6.7 Factorial designs and designed experiments options(contrasts=c("contr.helmert", "contr.poly")) data(npk) npk.aov <- aov(yield ~ block + N*P*K, npk) npk.aov summary(npk.aov) alias(npk.aov) coef(npk.aov) options(contrasts=c("contr.treatment", "contr.poly")) npk.aov1 <- aov(yield ~ block + N + K, npk) summary.lm(npk.aov1) se.contrast(npk.aov1, list(N=="0", N=="1"), data=npk) model.tables(npk.aov1, type="means", se=T) mp <- c("-","+") NPK <- expand.grid(N=mp, P=mp, K=mp) NPK if(F) { blocks13 <- fac.design(levels=c(2,2,2), factor=list(N=mp, P=mp, K=mp), rep=3, fraction=1/2) blocks46 <- fac.design(levels=c(2,2,2), factor=list(N=mp, P=mp, K=mp), rep=3, fraction=~ -N:P:K) NPK <- design(block = factor(rep(1:6, rep(4,6))), rbind(blocks13, blocks46)) i <- order(runif(6)[NPK$block], runif(24)) NPK <- NPK[i,] # Randomized lev <- rep(2,7) factors <- list(S=mp, D=mp, H=mp, G=mp, R=mp, B=mp, P=mp) Bike <- fac.design(lev, factors, fraction = ~ S:D:G + S:H:R + D:H:B + S:D:H:P) Bike replications(~ .^2, data=Bike) } # 6.8 An unbalanced four-way layout data(quine) attach(quine) table(Lrn, Age, Sex, Eth) Means <- tapply(Days, list(Eth, Sex, Age, Lrn), mean) Vars <- tapply(Days, list(Eth, Sex, Age, Lrn), var) SD <- sqrt(Vars) par(mfrow=c(1,2), pty="s") plot(Means, Vars, xlab="Cell Means", ylab="Cell Variances") plot(Means, SD, xlab="Cell Means", ylab="Cell Std Devn.") detach() boxcox(Days+1 ~ Eth*Sex*Age*Lrn, data = quine, singular.ok = T, lambda = seq(-0.05, 0.45, len = 20)) logtrans(Days ~ Age*Sex*Eth*Lrn, data = quine, alpha = seq(0.75, 6.5, len=20), singular.ok = T) quine.hi <- aov(log(Days + 2.5) ~ .^4, quine) quine.nxt <- update(quine.hi, . ~ . - Eth:Sex:Age:Lrn) dropterm(quine.nxt, test="F") quine.lo <- aov(log(Days+2.5) ~ 1, quine) addterm(quine.lo, quine.hi, test="F") quine.stp <- stepAIC(quine.nxt, scope = list(upper = ~Eth*Sex*Age*Lrn, lower = ~1), trace = F) quine.stp$anova dropterm(quine.stp, test="F") quine.3 <- update(quine.stp, . ~ . - Eth:Age:Lrn) dropterm(quine.3, test="F") quine.4 <- update(quine.3, . ~ . - Eth:Age) dropterm(quine.4, test="F") quine.5 <- update(quine.4, . ~ . - Age:Lrn) dropterm(quine.5, test="F") # 6.9 Predicting computer performance data(cpus) boxcox(perf ~ syct+mmin+mmax+cach+chmin+chmax, data=cpus, lambda=seq(0, 1, 0.1)) cpus1 <- cpus attach(cpus) for(v in names(cpus)[2:7]) cpus1[[v]] <- cut(cpus[[v]], unique(quantile(cpus[[v]])), include.lowest = T) detach() boxcox(perf ~ syct+mmin+mmax+cach+chmin+chmax, data = cpus1, lambda = seq(-0.25, 1, 0.1)) set.seed(123) cpus0 <- cpus1[, 2:8] # excludes names, authors' predictions cpus.samp <- sample(1:209, 100) cpus.lm <- lm(log10(perf) ~ ., data=cpus1[cpus.samp,2:8]) test.cpus <- function(fit) sqrt(sum((log10(cpus0[-cpus.samp, "perf"]) - predict(fit, cpus0[-cpus.samp,]))^2)/109) test.cpus(cpus.lm) cpus.lm2 <- stepAIC(cpus.lm, trace=F) cpus.lm2$anova test.cpus(cpus.lm2) res1 <- log10(cpus1[-cpus.samp, "perf"]) - predict(cpus.lm, cpus0[-cpus.samp,]) res2 <- log10(cpus1[-cpus.samp, "perf"]) - predict(cpus.lm2, cpus0[-cpus.samp,]) library(ctest) wilcox.test(res1^2, res2^2, paired=T, alternative="greater") # 6.10 Multiple comparisons data(immer) immer.aov <- aov((Y1+Y2)/2 ~ Var + Loc, data=immer) summary(immer.aov) model.tables(immer.aov, type="means", se=T, cterms="Var") if(F) { print(multicomp(immer.aov, plot=T)) oats1 <- aov(Y ~ N + V + B, data=oats) print(summary(oats1)) print(multicomp(oats1, focus="V")) lmat <- matrix(c(0,-1,1,rep(0, 11), 0,0,-1,1, rep(0,10), 0,0,0,-1,1,rep(0,9)),,3, dimnames=list(NULL, c("0.2cwt-0.0cwt", "0.4cwt-0.2cwt", "0.6cwt-0.4cwt"))) print(multicomp(oats1, lmat=lmat, bounds="lower", comparisons="none")) } # 6.11 Random and mixed effects if(F) { summary(raov(Conc ~ Lab/Bat, data = coop, subset = Spc=="S1")) #coop <- coop # make a local copy (needed in S-PLUS >= 5) is.random(coop) <- T is.random(coop$Spc) <- F is.random(coop) varcomp(Conc ~ Lab/Bat, data=coop, subset = Spc=="S1") varcomp(Conc ~ Lab/Bat, data=coop, subset = Spc=="S1", method = c("winsor", "minque0")) } data(oats) #oats <- oats # make a local copy: needed in S-PLUS >= 5 oats$Nf <- ordered(oats$N, levels=sort(levels(oats$N))) oats.aov <- aov(Y ~ Nf*V + Error(B/V), data = oats, qr = T) summary(oats.aov) #summary(oats.aov, split=list(Nf=list(L=1, Dev=2:3))) par(mfrow=c(1,2), pty="s") plot(fitted(oats.aov[[4]]), studres(oats.aov[[4]])) abline(h=0, lty=2) oats.pr <- proj(oats.aov) qqnorm(oats.pr[[4]][,"Residuals"], ylab="Stratum 4 residuals") qqline(oats.pr[[4]][,"Residuals"]) par(mfrow=c(1,1), pty="m") oats.aov2 <- aov(Y ~ N + V + Error(B/V), data = oats, qr = T) model.tables(oats.aov2, type = "means", se = T) if(F){ is.random(oats$B) <- T varcomp(Y ~ N + V + B/V, data = oats) xyplot(Y ~ EP | No, data = petrol, xlab = "ASTM end point (deg. F)", ylab = "Yield as a percent of crude", panel = function(x, y) { m <- sort.list(x) panel.grid() panel.xyplot(x[m], y[m], type = "b", cex = 0.5) }) } data(petrol) Petrol <- petrol names(Petrol) Petrol[, 2:5] <- scale(as.matrix(Petrol[, 2:5]), scale = F) pet1.lm <- lm(Y ~ No/EP - 1, Petrol) matrix(round(coef(pet1.lm),2), 2, 10, byrow = T, dimnames = list(c("b0","b1"),levels(Petrol$No))) pet2.lm <- lm(Y ~ No - 1 + EP, Petrol) anova(pet2.lm, pet1.lm) pet3.lm <- lm(Y ~ SG + VP + V10 + EP, Petrol) anova(pet3.lm, pet2.lm) library(nlme) pet3.lme <- lme(Y ~ SG + VP + V10 + EP, random = ~ 1 | No, data = Petrol) summary(pet3.lme) pet3.lme <- update(pet3.lme, method = "ML") summary(pet3.lme) pet4.lme <- update(pet3.lme, fixed = Y ~ V10 + EP) anova(pet4.lme, pet3.lme) coef(pet4.lme) pet5.lme <- update(pet4.lme, random = ~ 1 + EP | No) anova(pet4.lme, pet5.lme) data(coop) lme(Conc ~ 1, random = ~1 | Lab/Bat, data = coop, subset = Spc=="S1") options(contrasts = c("contr.treatment", "contr.poly")) oats.lme <- lme(Y ~ N + V, random = ~1 | B/V, data = oats) summary(oats.lme) data(Sitka) sitka.lme <- lme(size ~ treat*ordered(Time), random = ~1 | tree, data=Sitka) summary(sitka.lme) Sitka <- Sitka attach(Sitka) Sitka$treatslope <- Time * (treat=="ozone") detach() sitka.lme2 <- update(sitka.lme, fixed = size ~ ordered(Time) + treat + treatslope) summary(sitka.lme2) fitted(sitka.lme2, level=0)[1:5] fitted(sitka.lme2, level=0)[301:305] sitka.lme <- lme(size ~ treat*ordered(Time), random = ~1 | tree, data = Sitka, corr = corCAR1(, ~Time | tree)) summary(sitka.lme) # End of ch06