est.dispersion <- function(obs, est, distname)
{
# est.dispersion
# Estimates the dispersion for a set of past data where actual and
# estimated efforts are known.
# "obs" is a vector of observed efforts taken from past data.
# "est" is the corresponding vector of estimated efforts:
# these represent the MEDIANS of the distributions for distname=lnorm.med,
# and the MEANS of the distributions for all other distnames.
#
# This function is most appropriate when the estimated efforts were
# obtained by guesswork before the projects started. If they were
# derived as the output of a regression model, then dispersion would
# normally be estimated as part of the model, rather than estimating
# it separately using this function.
#
# "distname" is one of "norm", "gamma", "lnorm", or "lnorm.med"; it must be supplied
# as a character string.
# For distname="norm", dispersion = variance = sigma^2.
# For distname="gamma" and "lnorm", dispersion = variance/(mean^2).
# For distname="lnorm.med", dispersion = variance/(median^2).
#
#
# EXAMPLE:
# a set of past projects might be stored in data frame "pastdata"
# with columns "observed" and "estimated": then to estimate the
# dispersion for the Gamma distribution:
# est.dispersion(obs=pastdata$observed, est=pastdata$estimated, "gamma")
#
ddist.func <- function(x, distname, meanvec, dispersion)
{
switch(distname,
norm = dnorm(x, mean = meanvec, sd = sqrt(dispersion)),
gamma = dgamma(x, shape = 1/dispersion, scale =
dispersion * meanvec),
lnorm = dlnorm(x, meanlog = log(meanvec/sqrt(1 +
dispersion)), sdlog = sqrt(log(1 + dispersion))),
lnorm.med = dlnorm(x, meanlog = log(meanvec),
sdlog = sqrt(log((1 + sqrt(1 + 4 * dispersion))/2))),
stop("invalid probability distribution specified. \nUse distname= \"norm\", \"gamma\", \"lnorm\", or \"lnorm.med\"."))
}
meanvec <- est
#
# (called meanvec although it will be medians for lnorm.med distribution).
# nllike.func is the negative log-likelihood when mean(or median)=est,
# observations=obs, and dispersion is known:
nllike.func <- function(disp){
# add a line to bypass dgamma if distname=gamma,
# because R on Windows will crash if the dispersion
# hits a negative value during the optimization
if((distname == "gamma")&(disp <= 0)) nllike <- NaN
else nllike <- sum(-log(ddist.func(obs, distname, meanvec, disp)))
nllike
}
#
# choose a starting value for dispersion for the minimization:
# for the Normal distribution this is just the variance;
# for the Gamma and Lognormal distributions it is based on the idea that
# the maximum difference between estimate and observed might be about 2 sd's,
# so dispersion estimate would be sd^2/mean^2 and this might therefore be
# about 1/4 * max((estimate-observed)^2/estimate^2);
# this value is also used as a crude guess for the Lognormal-median distribution.
start.disp <- switch(distname,
norm = mean((obs-meanvec)^2),
gamma = max((1-obs/meanvec)^2)/4,
lnorm = max((1-obs/meanvec)^2)/4,
lnorm.med = max((1-obs/meanvec)^2)/4,
stop("invalid probability distribution specified. \nUse distname= \"norm\", \"gamma\", \"lnorm\", or \"lnorm.med\"."))
# switch off warning messages just for the minimization:
options(warn=-1)
result <- nlm(nllike.func, start.disp, iterlim=1000)
cat("\n\nStart value for minimization: ", start.disp, "\n\n")
cat("Minimization result:\n")
print(result)
# switch warning messages back on:
options(warn=0)
disp.est <- c(disp.est=result$estimate)
cat("Estimated dispersion: ", disp.est,"\n")
disp.est
}