# R code to accompany
# "Row-Column Interaction Models, with an R implementation" by
# Thomas W. Yee and Alfian F. Hadi,
# Computational Statistics, 2014 (in press).
# Most of the examples here come from Section 5.

# Last modified: 20140521.


# Note: future changes to \pkg{VGAM} and \pkg{VGAMdata} are possible.
# The latest version of these R packages are available at
# http://www.stat.auckland.ac.nz/~yee


# ======================================================================
# Set things up
# Version 0.9-4 of both packages are required.


library("VGAM")  # Modelling functions are here.


library("VGAMdata")  # Some data sets are here.


# Sweave options
ps.options(pointsize = 12)
options(width = 72, digits = 4)
options(SweaveHooks = list(fig = function() par(las = 1)))
options(prompt = "R> ", continue = "+  ")
options(continue = " ")


# ======================================================================
# GRC model fitted to alcoholic offences (alcoff)
# See Section 5.1 of Yee and Hadi (2014).


alcoff  # A 24 x 7 response matrix of counts


# Independence Poisson model
grc0.alcoff <- rcim(moffset(alcoff, "6", postfix = "*"),
                    rprefix = "Hour.24.", cprefix = "Day.")


# Plot the main effects
par(mfrow = c(1, 2), mar = c(4, 4.5, 0.9, 0.9) + 0.1)

plot(grc0.alcoff, rcol = "blue", ccol = "orange",
     rfirst = 14, cfirst = 1, rtype = "h", ctype = "h",
     lwd = 2, ylim = c(-1.5, 1.5),
     cylab = "Effective daily effects",
     rylab = "Hourly effects",
     rxlab = "Hour", cxlab = "Effective day")


# The estimated row and column effects may be extracted as follows.

round(coef(grc0.alcoff), dig = 2)
plot.grc0.alcoff <- plot(grc0.alcoff, which = NULL)
round(plot.grc0.alcoff@post$raw.col.effects, dig = 2)
round(plot.grc0.alcoff@post$col.effects, dig = 2)


class(grc0.alcoff)
summary(grc0.alcoff)


# ======================================================================
# Median polish fit to crash injuries, \code{crashi}.
# See Section 5.2 of Yee and Hadi (2014).
# Note that the asymmetric Laplace distribution is not naturally
# well-suited to estimation by scoring, hence this example is not
# fully reliable.


crashi  # crash injuries; a 24 x 7 matrix of counts


# Fit the median polish model

mp0.crashi <- rcim(crashi, alaplace2(tau = 0.5, intparloc = TRUE))


# Plot the main effects
par(mfrow = c(1, 2), mar = c(4.4, 4.3, 0.9, 1.9) + 0.1)

plot.mp0.crashi <-
  plot(mp0.crashi, lwd = 2, hlwd = 2,
       hcol = "purple", rcol = "blue", ccol = "darkorange",
       rtype = "p", ctype = "p", cylab = "Daily effects",
       rylab = "Hourly effects", rxlab = "Hour", cxlab = "Day")


# ======================================================================
# Quasi-variances example.
# Ships data, a simple Poisson regression.
# See Section 5.3 of Yee and Hadi (2014).


data("ships", package = "MASS")          # Get the 'ships' data from MASS
ships$Year <- as.factor(ships$year)      # A factor version of years
ships$Period <- as.factor(ships$period)  # A factor version of period


Shipmodel <- vglm(incidents ~ type + Year + Period,
                  quasipoissonff, data = ships,
                  subset = (service > 0), trace = TRUE,
                  offset = log(service))


fit1 <- rcim(Qvar(Shipmodel, "type"), uninormal("explink"), maxit = 101)
quasiVar <- qvar(fit1)  # Same as diag(predict(fit1)[, c(TRUE, FALSE)]) / 2
quasiSE <- sqrt(quasiVar)

quasiVar
quasiSE


# 5% LSD intervals (arrows) based on quasi-variances.

par(mfrow = c(1, 1), mar = c(4.1, 4.3, 0.1, 0.1) + 0.1)
plotqvar(fit1, scol = "blue", pch = 16,
          slwd = 1.5, las = 1, length.arrows = 0.07)


# ======================================================================
# Quasi-variances example.
# Zero-inflated Poisson (ZIP) example.
# xs.nz data subset, with babies as a response.
# See Section 5.4 of Yee and Hadi (2014).


# Select the subset

xs.nz.f <- subset(xs.nz, sex == "F")  # Only females
xs.nz.f <- subset(xs.nz.f, !is.na(babies)  &
                           !is.na(age)     &
                           !is.na(ethnic))
xs.nz.f <- xs.nz.f[xs.nz.f$babies != "-", ]  # Delete this one person
xs.nz.f$babies <- as.numeric(as.character(xs.nz.f$babies))
xs.nz.f <- subset(xs.nz.f, babies >=  0)
xs.nz.f <- subset(xs.nz.f, as.numeric(as.character(ethnic)) <=  2) 


# Not a Poisson, looks like a zero-inflated Poisson

with(xs.nz.f, barplot(table(babies), las = 1, ylab = "Frequency",
                      xlab = "Babies"))


# Fit a zero-inflated Poisson with smooth functions of age

clist <- list("bs(age, df = 4)"  = rbind(1, 0),
              "bs(age, df = 3)"  = rbind(0, 1),
              "ethnic"           = diag(2), 
              "(Intercept)"      = diag(2))
fit1.qvzip <- vglm(babies ~ bs(age, df = 4) + bs(age, df = 3) + ethnic,
                   zipoissonff(zero = NULL), data = xs.nz.f,
                   trace = TRUE,
                   constraints = clist)


# Compute and plot the Quasi-SE plots for a zero-inflated Poisson
# model fitted to a subset of females from \texttt{xs.nz} with babies
# as the response.

Fit1.qvzip <- rcim(Qvar(fit1.qvzip, "ethnic", which = 1),
                   uninormal("explink"), maxit = 99, trace = TRUE)
Fit2.qvzip <- rcim(Qvar(fit1.qvzip, "ethnic", which = 2),
                   uninormal("explink"), maxit = 99, trace = TRUE)


# Plot the quasi-SEs

par(mfrow = c(1, 2), mar = c(4, 4.5, 0.9, 0.9) + 0.1)

plotqvar(Fit1.qvzip, scol = "blue", pch = 16,
         main = expression(eta[1]),
         slwd = 1.5, las = 1, length.arrows = 0.07)
plotqvar(Fit2.qvzip, scol = "blue", pch = 16,
         main = expression(eta[2]),
         slwd = 1.5, las = 1, length.arrows = 0.07)


# ======================================================================
# Goodman's RC-association model (GRC model) and
# unconstrained quadratic ordination (UQO).
# This example shows how fitting a rank-1 GRC to Poisson counts is
# one method for performing a rank-1 UQO with species having
# equal tolerances.
# See Section 5.5 of Yee and Hadi (2014).
# The results easily generalize to higher dimensions.


set.seed(123)  # For reproducibility
n <- 100; p <- 5; S <- 5  # S is the number of species
pdata <- rcqo(n, p, S, es.opt = FALSE, eq.max = FALSE, eq.tol = FALSE,
              sd.tolerances = 1, sd.latvar = 0.75)  # Random X and Y
true.nu <- attr(pdata, "latvar")  # The 'truth'
Y <- pdata[, paste0("y", 1:S)]  # Y matrix (n x S)
uqo.rcim1 <- rcim(Y, Rank = 1,
                  str0 = NULL,  # Delta covers entire n x M matrix
                  iindex = 1:nrow(Y),  # RRR covers the entire Y
                  has.intercept = FALSE)  # Suppress the intercept


# Plot the results
par(mfrow = c(2, 2), mar = c(4.5, 4.5, 0.9, 0.9) + 0.1)

# Plot "(a)" for the optimums
plot(attr(pdata, "optima"), Coef(uqo.rcim1)@A,
     col = "blue", type = "p", main = "(a)")
mylm <- lm(Coef(uqo.rcim1)@A ~ attr(pdata, "optima"))
abline(coef = coef(mylm), col = "orange", lty = "dashed")

# Plot "(b)" for the site scores
fill.val <- NULL  # Choose this for the new parameterization
plot(attr(pdata, "latvar"), c(fill.val, concoef(uqo.rcim1)),
     las = 1, col = "blue", type = "p", main = "(b)")
mylm <- lm(c(fill.val, concoef(uqo.rcim1)) ~ attr(pdata, "latvar"))
abline(coef = coef(mylm), col = "orange", lty = "dashed")


# Plot unequal-tolerances CQOs.
# Plot "(c)" is the 'truth'.
# Plot "(d)" is for the UQO site scores and should be similar to (c).
myform <- attr(pdata, "formula")
p1ut <- cqo(myform, family = poissonff,
            eq.tol = FALSE, trace = FALSE, data = pdata)
c1ut <- cqo(cbind(y1, y2, y3, y4, y5) ~ scale(latvar(uqo.rcim1)),
            family = poissonff, eq.tol = FALSE, trace = FALSE,
            data = pdata)
lvplot(p1ut, lcol = 1:S, y = TRUE, pcol = 1:S, pch = 1:S, pcex = 0.5,
       main = "(c)")
lvplot(c1ut, lcol = 1:S, y = TRUE, pcol = 1:S, pch = 1:S, pcex = 0.5,
       main = "(d)")


# ======================================================================
# Rasch model example.
# Note: requires \pkg{VGAMdata} version 0.9-4 or later.


summary(exam1)  # The names of the students are in the first column


# Fit a simple Rasch model.
# First, remove all questions and people who were totally correct or wrong
Exam1.1 <- exam1  [, c(TRUE, colMeans(exam1  [, -1]) > 0)]
Exam1.1 <- Exam1.1[, c(TRUE, colMeans(Exam1.1[, -1]) < 1)]
Exam1.1 <- Exam1.1[rowMeans(Exam1.1[, -1]) > 0, ]
Exam1.1 <- Exam1.1[rowMeans(Exam1.1[, -1]) < 1, ]
rdata <- Exam1.1
Y.matrix <- rdata[, -1]  # Exclude the names of the students


# Fit the simple Rasch model.
# Argument 'mv' renamed to 'multiple.responses':
rfit <- rcim(Y.matrix, family = binomialff(multiple.responses = TRUE),
             trace = TRUE)


coef(rfit)  # Row and column effects
constraints(rfit, matrix = TRUE)  # Constraint matrices side-by-side
dim(model.matrix(rfit, type = "vlm"))  # 'Big' VLM matrix


# This plot shows the (main) row and column effects
par(mfrow = c(1, 2), las = 1, mar = c(4.5, 4.4, 2, 0.9) + 0.1)
rfit.extra <- plot(rfit, rcol = "blue", ccol = "orange",
                   cylab = "Item effects", rylab = "Person effects",
                   rxlab = "", cxlab = "")


names(rfit.extra@post)  # Some useful output put here
cbind(rfit.extra@post$row.effects)
cbind(rfit.extra@post$raw.row.effects)
round(cbind(-rfit.extra@post$col.effects), dig = 3)
round(cbind(-rfit.extra@post$raw.col.effects), dig = 3)
round(matrix(-rfit.extra@post$raw.col.effects, ncol = 1, # Rename for humans
             dimnames = list(colnames(Y.matrix), NULL)), dig = 3)


# ======================================================================