## This code template file contains two *incomplete* functions:
## 1. leastSquares.func : for fitting an "lm"-type model by least squares;
## 2. leastSquaresBootstrap.func : for finding the standard errors of the corresponding estimates by
##    nonparametric bootstrap.
## Your job is to read and understand the code, complete it, and demonstrate that you get equivalent answers
## to the R function "lm" using the Climate Data.
## NOTE: your estimates from leastSquares.func should be very close (almost identical) to your estimates
## from the "lm" function. Your confidence intervals from leastSquaresBootstrap.func should be similar to
## the confidence intervals returned by R for the "lm" model, but they won't be identical, because of the
## sampling variability inherent in bootstrapping and because bootstrapping does not give an exact standard
## error calculation.

## ----------------------------------------------------------------------------------------------------------------------
## 1. CODE TEMPLATE for leastSquares.func:
## ----------------------------------------------------------------------------------------------------------------------

leastSquares.func <- function(dat, xvar="x", yvar="y", startvec=c(0, 0)){
  ## leastSquares.func: fit a straight line y = a + b*x to the data points in "dat", which is a data frame
  ## with columns xvar and yvar, e.g. dat$x and dat$y.
  ## A least-squares fit decides on the best values of a and b by minimizing the sum of squared differences
  ## between the data points in dat$y and their fitted values on the line, a + b*dat$x.
  ##
  ## dat is a data frame. By default this code looks for columns called "x" and "y", but if your columns
  ## have different names you can enter them into the xvar and yvar arguments.
  ## startvec gives the starting values for the minimization algorithm: using a=0 and b=0 should work OK
  ## here.

  ## Extract the "x" and "y" variables from the data according to the column names you've specified in the
  ## xvar and yvar arguments:
  dat.x <- dat[,xvar]
  dat.y <- dat[,yvar]

  ## Define an inner function for your objective function. This needs to take the single
  ## vector argument "pars". In our case, pars[1] is our first parameter, a, and pars[2] is our
  ## second parameter, b.
  objective.func <- function(pars){
    ## Unpack pars into its components, a and b:
    a <- pars[1]
    b <- pars[2]

    ## The line has equation a + b*x, so for the current values of a and b, the fitted values at the
    ## particular x-values in your dataset are given by a + b*dat.x:
    fittedValues <- a + b*dat.x

    ## Find the sum of squared distances from the observations (y) to the fitted values:
    sumSquares <- # *** for you to complete ***

    ## Enter code here to fix up any cases where your objective is NA or infinite:
    # *** for you to complete ***

    ## Return the objective, i.e. the quantity you want to minimize:
    return( # *** for you to complete *** )
  }
  ## This is the end of the inner function.
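  ## ----------------------------------------------------------------------------------------------------
  ## For reference, one possible completion of the three blanks above, shown as a hedged sketch (other
  ## choices would work too, e.g. a different penalty value for non-finite objectives):
  ##
  ##   ## Sum of squared residuals between the observations and their fitted values:
  ##   sumSquares <- sum((dat.y - fittedValues)^2)
  ##
  ##   ## nlm can stall if the objective evaluates to NA, NaN, or Inf; is.finite() is FALSE for all of
  ##   ## these, so replace any non-finite objective with a very large finite penalty:
  ##   if(!is.finite(sumSquares)) sumSquares <- 1e12
  ##
  ##   ## Return the quantity to be minimized:
  ##   return(sumSquares)
  ## ----------------------------------------------------------------------------------------------------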
  ## Now apply the optimizer to find the least-squares estimates of a and b:
  ## "nlm" stands for "non-linear minimization" and is a general way of finding the minimum of
  ## an objective function by repeatedly trying out different parameter inputs to find which ones give the
  ## lowest results (i.e. the least squares). The "nlm" function uses a Newton-type algorithm which
  ## specifies how to adjust the parameter inputs at each step to facilitate fast convergence to the
  ## minimum.
  ## From a statistician's perspective, all you need to be able to do is to evaluate your objective
  ## function given any parameter inputs (this is sorted out in the function objective.func above), and to
  ## give the nlm algorithm a vector of parameter values to start at: this is startvec. Once it's been
  ## given a starting point, the algorithm will automatically select all other parameter values to try,
  ## before it (hopefully) converges to the minimum.
  leastSquares.fit <- nlm(f=objective.func, p=startvec)

  ## leastSquares.fit contains your estimates, and other useful items like the gradient at the minimum and
  ## convergence codes. Return the whole list so you can inspect it for any problems:
  return(leastSquares.fit)
}

## ----------------------------------------------------------------------------------------------------------------------
## 2. CODE TEMPLATE for leastSquaresBootstrap.func:
## ----------------------------------------------------------------------------------------------------------------------

leastSquaresBootstrap.func <- function(dat, xvar="x", yvar="y", startvec=c(0, 0), nboot=10000){
  ## leastSquaresBootstrap.func:
  ## Uses the nonparametric bootstrap to estimate standard errors and confidence intervals for a and b.
  ## dat is a data frame with columns xvar and yvar, and ndat rows.
  ## Each bootstrap sample is a sample of size ndat generated by sampling the rows of
  ## the data frame dat with replacement.
  ## For every bootstrap sample, we fit the least-squares model again and estimate a and b.
  ## Out of nboot bootstrap samples, this gives us nboot estimates of a, and nboot estimates of b.
  ## The standard error of each parameter is given by the standard deviation of its nboot estimates.
  ## 95% confidence intervals for a and b are given by the 0.025 and 0.975 quantiles of the nboot ordered
  ## estimates for each parameter.

  ## First find the number of data points in "dat": this is the number of rows in the data frame.
  ndat <- nrow(dat)

  ## There are various ways of setting up a loop in R. This way is not the best, but it's
  ## simple and good enough for our purposes.
  ## Create a data frame called "boot.res" to store the estimated values of a and b for each of the
  ## nboot iterations. Populate it with "NA" for each of a and b to start with.
  ## Use column headers "LS.a" and "LS.b" to specify that you're using the least-squares method
  ## for these estimates. (Later, you might want to add additional columns to this data frame using
  ## another method as part of your additional tasks.)
  boot.res <- data.frame(bootrep=1:nboot, LS.a=rep(NA, nboot), LS.b=rep(NA, nboot))

  ## Now let "i" loop from 1 to nboot. For each i, resample the data, fit the least-squares model, and
  ## store the results:
  for(i in 1:nboot){
    ## Create the bootstrap resampled data, dat.boot, for replicate i.
    ## First resample ndat numbers from 1 to ndat, with replacement. We'll use these as the rows of
    ## the data that we want to retain for sample i. Note that some rows will appear more than
    ## once and others won't appear at all.
    resampleRows <- # *** for you to complete ***

    ## Create the data for the resample: this is easily done by subsetting "dat" by the selected rows:
    dat.boot <- dat[resampleRows, ]

    ## Fit the least-squares model to the resampled data, dat.boot:
    LSfit.boot <- # *** for you to complete ***

    ## Enter the estimated values into row i of boot.res:
    boot.res$LS.a[i] <- LSfit.boot$estimate[1]
    boot.res$LS.b[i] <- LSfit.boot$estimate[2]

    ## End of bootstrap replicate i. Return to the top of the loop for the next bootstrap resample.
  }
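  ## ----------------------------------------------------------------------------------------------------
  ## For reference, one possible completion of the two blanks inside the loop above (a sketch):
  ##
  ##   ## Draw ndat row indices from 1:ndat, with replacement:
  ##   resampleRows <- sample(1:ndat, size=ndat, replace=TRUE)
  ##
  ##   ## Refit the model by calling leastSquares.func on the resampled data, passing through the
  ##   ## column names and starting values:
  ##   LSfit.boot <- leastSquares.func(dat.boot, xvar=xvar, yvar=yvar, startvec=startvec)
  ## ----------------------------------------------------------------------------------------------------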
  ## We've now finished the loop, so we've filled up the whole of boot.res.

  ## Find the standard errors for each parameter as the sample standard deviation of the bootstrap
  ## estimates:
  se.a.LS <- sd(boot.res$LS.a)
  se.b.LS <- # *** for you to complete ***

  ## Find the 95% confidence intervals for a and b as the 2.5% and 97.5% quantiles of the ordered sample:
  CI.a.LS <- quantile(boot.res$LS.a, probs=c(0.025, 0.975))
  CI.b.LS <- # *** for you to complete ***

  ## Return items of interest:
  return(list(stderror=c(LS.a=se.a.LS, LS.b=se.b.LS), CI.a.LS=CI.a.LS, CI.b.LS=CI.b.LS))
}
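## --------------------------------------------------------------------------------------------------------
## For reference, possible completions of the last two blanks, mirroring the lines for a (a sketch):
##
##   se.b.LS <- sd(boot.res$LS.b)
##   CI.b.LS <- quantile(boot.res$LS.b, probs=c(0.025, 0.975))
##
## A sketch of the demonstration against "lm", assuming the Climate Data are in a data frame called
## "climate" with columns "year" and "temp" (hypothetical names -- substitute the real ones):
##
##   LSfit <- leastSquares.func(climate, xvar="year", yvar="temp")
##   LSfit$estimate                  ## should be almost identical to coef(lm.fit) below
##   lm.fit <- lm(temp ~ year, data=climate)
##   coef(lm.fit)
##
##   LSboot <- leastSquaresBootstrap.func(climate, xvar="year", yvar="temp", nboot=10000)
##   LSboot$stderror                 ## compare with summary(lm.fit)$coefficients[, "Std. Error"]
##   LSboot$CI.a.LS                  ## compare with confint(lm.fit): similar, but not identical
##   LSboot$CI.b.LS
## --------------------------------------------------------------------------------------------------------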