setdates <- function(projdat, E = 4, Ntimes = 10000, distname, dispersion=NULL,
	 psuccess = 0.9, priority = NULL)
{
# setdates
# Sets finish dates for the non-committed projects in the project portfolio so
# that the overall probability of portfolio success is psuccess (if possible).
# First of all, the committed deadlines are examined to find the failure 
# probability on these deadlines only: e.g. might have 5 projects with 
# the 3rd one committed, and 10% failure to meet the third deadline alone.
# If this still leaves some allowed failure to divide between the 
# non-committed projects, (i.e. if psuccess < 0.9 in the example), then 
# p.firstfail is divided equally among the non-committed projects
# so that the overall success probability is psuccess.
# Note, however, that p.firstfail is divided equally only using those 
# trials that *succeeded* on all the committed deadlines (e.g. in the example,
# only on the 90% of trials that succeeded on deadline 3).  This means that,
# when the SETDATES deadlines are used in PROJMAN, it will NOT (generally) 
# indicate equal pc.firstfail among the uncommitted projects, and will also
# (generally) give different p.firstfail for the committed projects from 
# SETDATES.  This happens because some of the firstfail that SETDATES 
# attributes to the committed projects will later be swept up by the uncommitted 
# projects, once they have fixed deadlines as required by PROJMAN. 
#
# For SETDATES to run, it must be given a project priority (order of finish time). 
# Different priorities give different results.  
# The default (if the "priority" argument is left NULL) is to order the 
# projects so that all committed projects are to be finished first (in order of 
# their fixed deadlines), then all non-committed projects are to be finished in 
# increasing order of expected effort, e_i.
# Alternatively, the user can specify a priority using the argument "priority".
# This is a vector: for example, priority=c(3, 1, 2, 4), which indicates that 
# project 3 should be finished first, project 1 should be finished second, and
# so on.  An obvious choice would be to use priority=order(projdat$finish),
# so that new deadlines are determined for the projects in "projdat" but the
# order in which the projects are to be finished is the same as originally
# specified in the "projdat" dataframe.
#
# The priority argument must be a permutation of the project numbers 
# 1,2,3,..,nproj.
#
# Project details are contained in the dataframe projdat, with fields
#       name     (name)
#       needs    (estimated (mean) effort required, in person-days)
#       finish   (finish date: number of days from now, proposed or actual)
#       sofar    (effort expended so far, in person-days)
#       commit   (flag y/n indicating whether the finish date is a firm
#                       commitment or not)
# 
# Given E employees, the function simulates Ntimes sets of effort times from the
# distributions specified.  
# The finish-on-time criterion is (cumsum(Y) <= E*finish).
# 
# Y_i is the effort taken for project i (in person-days), and it is assumed to be
# distributed according to distribution distname, which is one of
# "norm", "pois", "chisq", "gamma", "lnorm", and "lnorm.med"; the mean/median of the 
# distribution is needs[i] and the dispersion is given by argument "dispersion", 
# which is either of length 1 (same dispersion for all projects)
# or of length nproj (each project with different dispersion, although this
# wouldn't be too common).
# For distribution lnorm.med, needs[i] represents the MEDIAN; for all other
# distributions, needs[i] represents the MEAN.  
#
# For distname="pois" or "chisq", dispersion should be NULL.
# For distname="norm", dispersion = variance = sigma^2.
# For distname="gamma", dispersion = variance/(mean^2).
# For distname="lnorm", dispersion = variance/(mean^2).
# For distname="lnorm.med", dispersion = variance/(median^2)
#         				 = exp(sdlog^2)*(exp(sdlog^2)-1).
#
#
# EXAMPLE: 
# find a set of finish dates for "portfolio" data set, with P(success)=0.90, 
# then test it in projman to check it's about right.  
# e.g. using chi-squared distribution:
# dates <- setdates(portfolio,E=4,Ntimes=10000,distname="chisq",psuccess=0.90) 
# projman(portfolio,E=4,Ntimes=10000,distname="chisq",newfinish=dates)
#
# or using gamma distribution with dispersion 0.4:
# dates <- setdates(portfolio, E=4, Ntimes=10000,
#                        distname="gamma", dispersion=0.4, psuccess=0.90) 
# projman(portfolio, E=4, Ntimes=10000, distname="gamma", dispersion=0.4,
#                        newfinish=dates)
# 
#
# first some housekeeping:
	nproj <- length(projdat$needs)
	if(!is.numeric(dispersion))
		switch(distname,
			gamma = stop(
				"\ndispersion argument needed for\ndistname=gamma."
				),
			norm = stop(
				"\ndispersion argument needed for distname=norm."
				),
			lnorm = stop(
				"\ndispersion argument needed for distname=lnorm."
				),
			lnorm.med = stop(
				"\ndispersion argument needed for distname=lnorm.med."
				),
			NULL)
	if((!is.null(dispersion)) & (length(dispersion) != 1.) & (length(
		dispersion) != nproj))
		stop("\ndispersion vector should be length 1 or nproj\n")
	if((!is.null(priority)) & (length(priority) != nproj))
		stop("\npriority vector must be a permutation of 1, 2, ..., nproj\n"
			)
        if((!is.null(priority)))
                if(sum((sort(priority) - (1:nproj))^2) > 0)
                	stop("\npriority vector must be a permutation of 1, 2, ..., nproj\n"
                        )
	#
	# Define functions rdist.func, qdist.func, and pdist.func, which generate
	# random samples, quantiles, and probabilities from the distribution
	# distname given mean vector meanvec and dispersion vector dispvec:
	if(missing(distname)) stop(
			"distname argument required. \nUse distname=\"pois\",\n\"norm\", \"chisq\", \"gamma\",  \"lnorm\", or \"lnorm.med\"."
			)
	rdist.func <- function(ss, distname, meanvec, dispvec)
	{
		switch(distname,
			norm = rnorm(ss, mean = meanvec, sd = sqrt(dispvec)),
			pois = rpois(ss, lambda = meanvec),
			chisq = rchisq(ss, df = meanvec),
			gamma = rgamma(ss, shape = 1/dispvec, scale=dispvec * meanvec),
			lnorm = rlnorm(ss, meanlog = log(meanvec/sqrt(1 + 
				dispvec)), sdlog = sqrt(log(1 + dispvec))),
			lnorm.med = rlnorm(ss, meanlog = log(meanvec),
				sdlog = sqrt(log((1 + sqrt(1 + 4 * dispvec))/2))),
			stop("invalid probability distribution specified. \nUse distname=\"pois\", \"norm\", \"chisq\", \"gamma\", \"lnorm\", or \"lnorm.med\"."))
	}
	qdist.func <- function(p, distname, meanvec, dispvec)
	{
		switch(distname,
			norm = qnorm(p, mean = meanvec, sd = sqrt(dispvec)),
			pois = qpois(p, lambda = meanvec),
			chisq = qchisq(p, df = meanvec),
			gamma = qgamma(p, shape = 1/dispvec, scale = dispvec * meanvec),
			lnorm = qlnorm(p, meanlog = log(meanvec/sqrt(1 + 
				dispvec)), sdlog = sqrt(log(1 + dispvec))),
			lnorm.med = qlnorm(p, meanlog = log(meanvec),
				sdlog = sqrt(log((1 + sqrt(1 + 4 * dispvec))/2))),
			stop("invalid probability distribution specified. \nUse distname=\"pois\", \"norm\", \"chisq\", \"gamma\", \"lnorm\", or \"lnorm.med\"."))
	}
	pdist.func <- function(q, distname, meanvec, dispvec)
	{
		switch(distname,
			norm = pnorm(q, mean = meanvec, sd = sqrt(dispvec)),
			pois = ppois(q, lambda = meanvec),
			chisq = pchisq(q, df = meanvec),
			gamma = pgamma(q, shape = 1/dispvec, scale = dispvec * meanvec),
			lnorm = plnorm(q, meanlog = log(meanvec/sqrt(1 + 
				dispvec)), sdlog = sqrt(log(1 + dispvec))),
			lnorm.med = plnorm(q, meanlog = log(meanvec),
				sdlog = sqrt(log((1 + sqrt(1 + 4 * dispvec))/2))),
			stop("invalid probability distribution specified. \nUse distname=\"pois\", \"norm\", \"chisq\", \"gamma\", \"lnorm\", or \"lnorm.med\"."))			
	}
	#
	# housekeeping with committed and uncommitted projects.
	# note that "committed.orig" is the indices of the ORIGINAL
	# data frame that are committed, and doesn't apply after
	# the projects have been reordered.  
	# The program exits if the committed deadlines are not 
	# consistent with those specified in priority: i.e.
	# needs 
	# order(projdat$finish[committed.orig])=order(ranks[committed.orig])
	# where ranks[i]=3 if project i has the 3rd shortest deadline, etc;
	# That is, the project with the shortest committed deadline appears
	# as the first rank among the committed projects, and so on.
	committed.orig <- (1.:nproj)[projdat$commit == "y"]
	ncommitted <- length(committed.orig)
	if(ncommitted > 0)
		mutable.orig <- (1:nproj)[ - committed.orig]
	else mutable.orig <- (1:nproj)
	nmutable <- length(mutable.orig)
	if(!is.null(priority)){
		ranks <- order(priority)
		if(sum((order(projdat$finish[committed.orig]) - order(ranks[
			committed.orig]))^2) > 0)
			stop("\nspecified priority vector is inconsistent with known commitments.\n"
			)
	}
	# 
	# Now put the projects into deadline order.  
	# If the "priority" vector is not specified, the default ordering
	# is to order the committed projects according to their finish date
	# and the uncommitted (mutable) projects according to their expected 
	# effort.
	# Otherwise, the vector ord is ord=priority.
	if(is.null(priority)) {
		com.order <- committed.orig[order(projdat$finish[committed.orig
			])]
		mutable.order <- mutable.orig[order(projdat$needs[mutable.orig]
			)]
		ord <- c(com.order, mutable.order)
	}
	else ord <- priority
	#
	# Apply the new ordering to all data fields:
	# the new vector "committed" refers to the ordered projects
	# (unlike committed.orig).
	name <- projdat$name[ord]
	needs <- projdat$needs[ord]
	sofar <- projdat$sofar[ord]
	commit <- projdat$commit[ord]
	finish <- projdat$finish[ord]
	committed <- (1:nproj)[commit == "y"]
	mutable <- (1:nproj)[commit == "n"]
	#
	if(length(dispersion) > 1.) dispersion <- dispersion[ord]
	cat("\nOrdered projects: \n")
	# Ensure that the dispersion vector has length nproj:
	#
	print(as.character(name), quote=F)
	if(length(dispersion) == 1.)
		dispersion <- rep(dispersion, nproj)
	# First generate times from unconditional probability distributions for the
	# projects not yet started:   
	ymat <- matrix(ncol = Ntimes, nrow = nproj)
	nunstarted <- length(sofar[sofar == 0])
	# Now generate times from the conditional probability distribution for the
	# projects already started (those with sofar>0):
	# need to subtract off the amount done so far (sofar) before inserting into
	# ymat.
	if(nunstarted > 0) ymat[sofar == 0,  ] <- rdist.func(Ntimes * 
			nunstarted, distname, meanvec = needs[sofar == 0],
			dispvec = dispersion[sofar == 0])
	nstarted <- length(sofar[sofar > 0])
	#
	if(nstarted > 0) {
		unif <- runif(Ntimes * nstarted)
		pdist.t <- pdist.func(sofar[sofar > 0], distname, meanvec = 
			needs[sofar > 0], dispvec = dispersion[sofar > 0])
		ymat[sofar > 0,  ] <- qdist.func((1 - pdist.t) * unif + 
			pdist.t, distname, meanvec = needs[sofar > 0], dispvec
			 = dispersion[sofar > 0]) - sofar[sofar > 0]
	}
	# Now find p.firstfail for the committed projects.
	# first.fail is a vector storing, for each of the Ntimes trials, the
	# first of the committed projects to fail. If this is NA then all 
	# committed projects succeeded.   
	cumsum.ymat <- apply(ymat, 2., cumsum)
	if(ncommitted == 1) {
		# want first.fail to be NA if the committed project
		# succeeded, or 1 otherwise
		first.fail <- rep(NA, Ntimes)
		first.fail[cumsum.ymat[committed,  ] > E * finish[committed]] <- 1
		# "1" means that first (and only) committed project failed
		p.firstfail <- length(na.omit(first.fail))/Ntimes
		cat("p.firstfail: ", p.firstfail, "\n")
	}
	else if(ncommitted > 1.) {
		first.fail <- apply(cumsum.ymat[committed,  ], 2., function(x)
		{
			# (e.g. returns 2 if the committed project 
			# with 2nd shortest deadline was the first to fail)
			# 
			min(c(nproj+10, (1:ncommitted)[x > E * finish[committed]]))
		}
		)
  	        first.fail[first.fail > nproj] <- NA  
		# note the treatment of min to deal with differences between Splus 
		# and R.
		# 
		overall.failures <- na.omit(first.fail)
		p.firstfail <- tabulate(overall.failures, nbins = ncommitted)/
			Ntimes
		cat("p.firstfail: ", p.firstfail, "\n")
	}
	else {
		first.fail <- rep(NA, Ntimes)
		p.firstfail <- 0
	}
	fail.ok <- (1 - psuccess - sum(p.firstfail))/nmutable
	if(fail.ok < 0)
		cat("\n\nSorry: specified failure rate is already exceeded by commitments. \nBest success rate is ",
			format(100 * (1 - sum(p.firstfail))), "%\n\n")
	#
	# Now use cumsum.ymat to find the suggested finish dates for each mutable
	# project: fail.ok is the allowed probability of first-fail for each one. 
	# Do the projects one at a time, store the results in finish.res.
	# Note that the fail.ok quantile must be divided by E for the number of
	# days necessary.
	# The vector rep.in contains the indices of cumsum.ymat that are "in"
	# at each stage: only those replicates who have "passed" all of projects
	# 1 to i-1 go through to project i.
	#
	fail.ok <- max(c(fail.ok, 0))
	finish.res <- numeric(nproj)
	if(ncommitted > 0)
		finish.res[committed] <- finish[committed]
	# point.ok is the number of the longest-taking projects that will be discarded 
	# to find the finish time at which fail.ok projects fail.  
	point.ok <- round(Ntimes * fail.ok)
	if(nmutable > 0.) {
		# 
		# the uncommitted projects are indexed by "mutable":
		# so the i'th project is mutable[i].
		cat("\n\n fail.ok (%): ", format(100. * fail.ok), "\n\n")
		rep.in <- (1:Ntimes)[is.na(first.fail)]
		# rep.in are those of the Ntimes simulations that 
		# succeeded on all the committed projects.  
		for(i in 1:nmutable) {
			proj.i <- mutable[i]
			n.in <- length(rep.in)
			# (this changes as i increments)
			cumsum.ind.i <- rep.in[order(cumsum.ymat[proj.i, rep.in
				])]
			finish.res[proj.i] <- cumsum.ymat[proj.i, cumsum.ind.i[
				n.in - point.ok]]/E
			rep.in <- cumsum.ind.i[1:(n.in - point.ok)]
		}
	}
	#
	# get finish.res back into order of original dataset to return:
	names(finish.res) <- levels(name)[name]
	finish.res[order(ord)]
}