est.dispersion <- function(obs, est, distname) { # est.dispersion # Estimates the dispersion for a set of past data where actual and # estimated efforts are known. # "obs" is a vector of observed efforts taken from past data. # "est" is the corresponding vector of estimated efforts: # these represent the MEDIANS of the distributions for distname=lnorm.med, # and the MEANS of the distributions for all other distnames. # # This function is most appropriate when the estimated efforts were # obtained by guesswork before the projects started. If they were # derived as the output of a regression model, then dispersion would # normally be estimated as part of the model, rather than estimating # it separately using this function. # # "distname" is one of "norm", "gamma", "lnorm", or "lnorm.med"; it must be supplied # as a character string. # For distname="norm", dispersion = variance = sigma^2. # For distname="gamma" and "lnorm", dispersion = variance/(mean^2). # For distname="lnorm.med", dispersion = variance/(median^2). # # # EXAMPLE: # a set of past projects might be stored in data frame "pastdata" # with columns "observed" and "estimated": then to estimate the # dispersion for the Gamma distribution: # est.dispersion(obs=pastdata$observed, est=pastdata$estimated, "gamma") # ddist.func <- function(x, distname, meanvec, dispersion) { switch(distname, norm = dnorm(x, mean = meanvec, sd = sqrt(dispersion)), gamma = dgamma(x, shape = 1/dispersion, scale = dispersion * meanvec), lnorm = dlnorm(x, meanlog = log(meanvec/sqrt(1 + dispersion)), sdlog = sqrt(log(1 + dispersion))), lnorm.med = dlnorm(x, meanlog = log(meanvec), sdlog = sqrt(log((1 + sqrt(1 + 4 * dispersion))/2))), stop("invalid probability distribution specified. \nUse distname= \"norm\", \"gamma\", \"lnorm\", or \"lnorm.med\".")) } meanvec <- est # # (called meanvec although it will be medians for lnorm.med distribution). # nllike.func is the negative log-likelihood when mean(or median)=est, # observations=obs, and dispersion is known: nllike.func <- function(disp){ # add a line to bypass dgamma if distname=gamma, # because R on Windows will crash if the dispersion # hits a negative value during the optimization if((distname == "gamma")&(disp <= 0)) nllike <- NaN else nllike <- sum(-log(ddist.func(obs, distname, meanvec, disp))) nllike } # # choose a starting value for dispersion for the minimization: # for the Normal distribution this is just the variance; # for the Gamma and Lognormal distributions it is based on the idea that # the maximum difference between estimate and observed might be about 2 sd's, # so dispersion estimate would be sd^2/mean^2 and this might therefore be # about 1/4 * max((estimate-observed)^2/estimate^2); # this value is also used as a crude guess for the Lognormal-median distribution. start.disp <- switch(distname, norm = mean((obs-meanvec)^2), gamma = max((1-obs/meanvec)^2)/4, lnorm = max((1-obs/meanvec)^2)/4, lnorm.med = max((1-obs/meanvec)^2)/4, stop("invalid probability distribution specified. \nUse distname= \"norm\", \"gamma\", \"lnorm\", or \"lnorm.med\".")) # switch off warning messages just for the minimization: options(warn=-1) result <- nlm(nllike.func, start.disp, iterlim=1000) cat("\n\nStart value for minimization: ", start.disp, "\n\n") cat("Minimization result:\n") print(result) # switch warning messages back on: options(warn=0) disp.est <- c(disp.est=result$estimate) cat("Estimated dispersion: ", disp.est,"\n") disp.est }