##### Functions of problem 1b

### First approach

# Fit an additive model (without estimating functions)
# Inputs: response vector,
#   matrix of input features,
#   tolerance level (amount by which predictions can change before we
#     care), defaults to the standard error of the mean response,
#   maximum number of passes over variables for backfitting, default 10
# Presumes: y and x have the same number of rows, no missing/NA values
# Calls: centered.smoothing()
# Outputs: list, with elements:
#   over-all mean response,
#   matrix of partial response functions evaluated at the data points,
#   fitted response value at each data point,
#   training inputs,
#   training response values,
#   number of iterations of backfitting,
#   convergence flag,
#   tolerance level,
#   maximum number of iterations allowed
addm.points = function(y,x,tolerance=sd(y)/sqrt(length(y)),max.passes=10) {
  alpha = mean(y)
  p = ncol(x)
  n = nrow(x)
  f = matrix(0,nrow=n,ncol=p)
  # f is the values of the functions at the training data points. One column
  # per function. All functions initialized to zero.
  y.pred = rep(alpha,n) # Initially, we always predict the mean
  # Iterate back-fitting until we no longer really change the predictions, or
  # until we max out on iterations
  max.iterations = max.passes*p
  made.changes = TRUE
  iterations = 0
  j = 0
  while(made.changes & (iterations < max.iterations)) {
    # Increment the count of the number of iterations
    iterations = iterations + 1
    # Increment the variable we're updating; keep it from getting out of bounds
    j = ifelse(j == p,1,j+1)
    # Calculate the partial residuals for this feature.
    # (Only use one set of partial residuals at a time, so a vector works)
    partial.residuals = y - (y.pred - f[,j])
    # partial residual = truth - (prediction - this function)
    # Smooth the partial residuals
    new.fj = centered.smoothing(x[,j],partial.residuals)
    # Update predictions, see if they've changed
    new.y.pred = y.pred + (new.fj - f[,j])
    # That is, the new prediction is the old prediction, plus the difference
    # between the old and new partial-response function
    # Could update by summing up each row in f (first updating column j), but
    # most columns aren't changing, so this wastes a lot of time
    f[,j] = new.fj
    # Count how many predictions changed more than we can tolerate
    number.changes = sum(abs(new.y.pred - y.pred) > tolerance)
    # If that's more than zero, then we made a significant change
    made.changes = (number.changes > 0)
    # Replace the old predictions with the new ones
    y.pred = new.y.pred
  }
  # Issue a warning if we left the loop without converging
  if (made.changes) {
    warning("Exited addm.points before backfitting converged")
  }
  # Return a list with the over-all mean response, the partial response
  # functions (f), the fitted values, the inputs, the actual response, the
  # number of iterations, the tolerance, the maximum number of iterations
  # allowed, and whether back-fitting converged.
  return(list(mean.response=alpha,partial.responses=f,
              fitted=alpha+apply(f,1,sum),x=x,y=y,
              iterations=iterations,converged=!made.changes,tolerance=tolerance,
              max.iterations=max.iterations))
}

# Compute a centered (mean-zero) smoothing
# Input: vector of input values, vector of response values
# Calls: ksmooth(), inverse.permutation()
# Output: smoothed vector of fitted response values, with sample mean of 0
centered.smoothing = function(x,y) {
  g = ksmooth(x,y,n.points=length(x),x.points=x)$y
  # PROBLEM: ksmooth returns its predictions with x _sorted_ in increasing
  # order, not the order of the data
  # SOLUTION: write a function which undoes this permutation
  inv.sort.of.x = inverse.permutation(order(x))
  g = g[inv.sort.of.x] # puts g back into data order
  # Update the estimated function, remembering to subtract off the mean
  # so that the sample average of each function stays zero
  return(g - mean(g))
}

# Invert a permutation vector
# Input: a permutation of a vector, represented as a vector of positive integers
#   with each integer from 1 to the vector's length appearing exactly once
# Presumes: input satisfies that condition
# Output: the inverse permutation
inverse.permutation = function(forward.permutation) {
  # Slow way with explicit iteration behind double-comments
  ## inv.perm = vector(length=length(forward.permutation))
  ## for (i in 1:length(forward.permutation)) {
  ##   inv.perm[i] = min(which(forward.permutation == i))
  ## }
  # The "match" function returns the indices of the first occurrences of its
  # first argument in its second argument.
  inv.perm = match(1:length(forward.permutation),forward.permutation)
  return(inv.perm)
}

####### Second approach

# Fit an additive model (of type "addm"), estimating actual functions which
# can be evaluated anywhere
# Uses back-fitting to estimate partial response functions (with loess())
# Some R trickery to get functions to bind properly
# Inputs: response vector,
#   matrix of input variables,
#   tolerance for predictions (how close successive predictions have to be
#     before we say they've converged),
#   maximum number of passes over input features
# Calls: make.addm(), predict.addm(), loess()
# Output: additive model ("addm" object), including partial response functions,
#   mean response, training data, fitted values, and estimation diagnostics
addm.functions = function(y,x,tolerance=sd(y)/sqrt(length(y)),
                          max.passes=10) {
  alpha = mean(y) # Over-all mean response
  p = ncol(x)     # Number of inputs
  n = nrow(x)     # Number of data points
  # Initialize the partial response functions to all be constant at zero.
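  # (Each partial response is stored as an R closure, so it can be evaluated
  # at arbitrary new inputs, not just the training points; a re-estimated
  # function carries its own loess fit and centering constant with it through
  # lexical scoping. See centered.smoothing.function() below.)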
  # Define a constant-at-zero function (takes vector input, gives vector output)
  constant.0.fn = function(x) {return(rep(0,length(x)))}
  # Store all the partial response functions in a list
  # The built-in "rep" command does not work with function objects, and I don't
  # feel like fixing this
  f = list(constant.0.fn)                          # Creates a one-element list
  for (j in (2:p)) { f[[j]] = constant.0.fn }      # Expands the list as needed
  # Initialize vector of predicted responses; start by just predicting the mean
  y.pred = rep(alpha,n)
  # Iterate back-fitting until predictions converge, or we max out on iterations
  max.iterations = max.passes*p
  made.changes = TRUE
  # Tracks whether any predictions have changed; initialize to TRUE to start
  # the back-fitting loop
  iterations = 0 # Tracks how many back-fitting iterations we've made
  j = 0          # Tracks which feature we're fitting
  while(made.changes & (iterations < max.iterations)) {
    iterations = iterations + 1 # Increment the iterations counter
    j = ifelse(j==p,1,j+1)
    # Now back-fit
    input.var = x[,j]
    # Calculate the partial residuals w.r.t. this variable
    partial.residuals = y - (y.pred - f[[j]](input.var))
    # partial residual = truth - full prediction + current function
    # Update the partial response function
    new.fj = centered.smoothing.function(input.var,partial.residuals)
    # We only ever have one of these around, just like the partial residuals
    # Update predictions, check if they've changed
    new.y.pred = y.pred + (new.fj(input.var) - f[[j]](input.var))
    # new prediction = old prediction + (change in re-estimated function)
    # Count how many predictions changed more than the tolerance
    number.changes = sum(abs(new.y.pred - y.pred) > tolerance)
    # If that's more than zero, then we have not converged
    made.changes = (number.changes > 0)
    # Replace the old estimated function and predictions
    f[[j]] = new.fj
    y.pred = new.y.pred
  }
  # Did we exit the loop because we converged, or because we maxed out?
  if (made.changes) {
    # We exited the loop before we converged
    warning("Exited addm.functions before backfitting converged")
  }
  # Return an object of type "addm" (containing the list of partial response
  # functions, the mean response, and the matrix of inputs used in fitting),
  # plus the fitted response values, actual response values, number of
  # back-fitting iterations, the tolerance used in fitting, the maximum number
  # of iterations allowed in fitting, and a Boolean flag for whether the
  # back-fitting converged. (Most of these are just diagnostics.)
  return(make.addm(partial.responses=f,mean.response=alpha,x=x,
                   fitted=y.pred,y=y,
                   iterations=iterations,tolerance=tolerance,
                   max.iterations=max.iterations,
                   converged=!made.changes))
}

# Smooth data (by loess), center the estimated function, and return the function
# Inputs: vector of input values,
#   vector of responses
# Calls: loess()
# Outputs: function which takes a vector of inputs and gives a vector of
#   predictions
centered.smoothing.function = function(x,y) {
  g = loess(y ~ x, surface="direct",degree=1)
  centering.constant = mean(predict(g))
  # Evaluate this once, now, and insert it as a constant into the function
  # definition
  return(function(u) { predict(g,newdata=u) - centering.constant } )
}

# Create an object of the type "addm", representing an additive model
# Contains: a list, including the partial response functions, the over-all
#   mean response, and the input values used in training. May also contain
#   other list elements.
# Inputs: partial response functions (generally a list of functions),
#   mean response (generally a single number),
#   training inputs (generally a matrix),
#   optional extra key/value pairs
# Outputs: an object of type "addm"
make.addm = function(partial.responses,mean.response,x,...) {
  my.addm = list(partial.responses=partial.responses,
                 mean.response=mean.response,x=x,...)
  class(my.addm) = "addm"
  return(my.addm)
}

# Predict mean responses from an additive model, of type "addm"
# Inputs: additive model ("addm" object),
#   matrix of input variables on which to predict (optional; defaults to
#     training set of first argument)
# Presumes: additive model has all the components created in make.addm,
#   the partial.responses attribute of the addm is a list of functions,
#   newdata has same number of columns as training data
# Output: vector of numerical predictions
predict.addm = function(am,newdata=am$x) {
  p = ncol(newdata) # Should test whether this is the number of input
                    # variables expected by am
  n = nrow(newdata)
  alpha = am$mean.response
  f = am$partial.responses
  f.matrix = matrix(0,nrow=n,ncol=p)
  for (j in (1:p)) {
    f.matrix[,j] = f[[j]](newdata[,j])
  }
  y.pred = alpha + apply(f.matrix,1,sum)
  return(y.pred)
}

############# Code for problem 2b

# Do one cross-validation run for picking bandwidth in ksmooth
# Inputs: vector of x values,
#   vector of y values,
#   proportion of data to use in training,
#   vector giving grid of bandwidths
# Calls: ksmooth(), sample()
# Outputs: vector giving mean-squared error on testing set at each bandwidth
one.cv.run.ksmooth = function(x,y,p,h.grid) {
  # Should test that 0 < p < 1, so that both sets are non-empty
  n = length(x)         # Should test that x and y have the same length!
  n.sample = round(n*p) # Should test that this is > 0
  train.rows = sample(1:n,n.sample)
  train.x = x[train.rows]
  train.y = y[train.rows]
  test.x = x[-train.rows]
  test.y = y[-train.rows]
  # ksmooth orders its output by increasing x --- to avoid confusion let's
  # take care of that ourselves
  test.y = test.y[order(test.x)] # Sorts y values in order of increasing x
  test.x = sort(test.x)          # Puts x itself in order
  mse = vector(length=length(h.grid))
  for (i in 1:length(h.grid)) {
    y.pred = ksmooth(train.x,train.y,kernel="normal",bandwidth=h.grid[i],
                     range.x=range(x),n.points=length(test.x),
                     x.points=test.x)$y
    mse[i] = mean((test.y - y.pred)^2)
  }
  return(mse)
}

# Use k-fold CV (implemented here as k independent random training/testing
# splits) to get out-of-sample error for multiple bandwidths in ksmooth
# Inputs: vector of x values,
#   vector of y values,
#   proportion of data to use in training,
#   vector of grid of bandwidths,
#   number of folds of CV
# Calls: one.cv.run.ksmooth()
# Outputs: list with best bandwidth,
#   original grid of bandwidths,
#   and vector of average out-of-sample error for each bandwidth
k.fold.cv.ksmooth = function(x,y,p,h.grid,k) {
  k.fold.results = replicate(k,one.cv.run.ksmooth(x,y,p,h.grid))
  mean.over.k.fold = apply(k.fold.results,1,mean)
  return(list(best.h = h.grid[which.min(mean.over.k.fold)],h.grid=h.grid,
              mean.mse.over.k.fold = mean.over.k.fold))
}
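
############# Usage sketch (illustrative only)

# A minimal sketch of how the functions above fit together, run on simulated
# data. Everything below --- the data-generating process, the object names
# (demo.x, demo.y, new.x, etc.), and the parameter choices --- is an
# assumption made for illustration, not part of the assigned problems.
set.seed(42) # Arbitrary seed, for reproducibility
demo.n = 200
demo.x = matrix(runif(2*demo.n,min=-3,max=3),ncol=2)
demo.y = sin(demo.x[,1]) + demo.x[,2]^2/4 + rnorm(demo.n,sd=0.1)
# First approach: partial response values at the training points only
fit.points = addm.points(demo.y,demo.x)
# Second approach: actual functions, usable at new input values
fit.functions = addm.functions(demo.y,demo.x)
new.x = matrix(runif(20,min=-3,max=3),ncol=2)
predict(fit.functions,newdata=new.x) # Dispatches to predict.addm()
# Problem 2b helper: pick a bandwidth for ksmooth() by cross-validation
k.fold.cv.ksmooth(demo.x[,1],demo.y,p=0.9,h.grid=seq(0.2,2,by=0.2),k=5)$best.h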