##### Functions of problem 1b
### First approach
# Fit an additive model (without estimating functions)
# Inputs: response vector,
# matrix of input features,
# tolerance level (amount by which predictions can change before we
# care), defaults to the standard error of the mean response
# maximum number of passes over variables for backfitting, default 10
# Presumes: y and x have the same number of rows, no missing/NA values
# Calls: centered.smoothing()
# Outputs: list, with elements:
# overall mean response,
# matrix of partial response functions evaluated at the data points,
# fitted response value at each data point,
# training input,
# training response values,
# number of iterations of backfitting,
# convergence flag,
# tolerance level,
# maximum number of iterations allowed
addm.points = function(y,x,tolerance=sd(y)/sqrt(length(y)),max.passes=10) {
alpha = mean(y)
p = ncol(x)
n = nrow(x)
f = matrix(0,nrow=n,ncol=p)
# f is the values of the functions at the training data points. One column
# per function. All functions initialized to zero.
y.pred = rep(alpha,n) # Initially, we always predict the mean
# Iterate back-fitting until we no longer really change the predictions, or
# until we max out on iterations
max.iterations = max.passes*p
made.changes = TRUE
iterations = 0
j = 0
while(made.changes & (iterations < max.iterations)) {
# Increment the count of the number of iterations
iterations = iterations + 1
# Increment the variable we're updating; keep it from getting out of bounds
j = ifelse(j == p,1,j+1)
# Calculate the partial residuals for this feature.
# (Only use one set of partial residuals at a time, so a vector works)
partial.residuals = y - (y.pred - f[,j])
# partial residual = truth - (prediction - this function)
# Smooth the partial residuals
new.fj = centered.smoothing(x[,j],partial.residuals)
# Update predictions, see if they've changed
new.y.pred = y.pred + (new.fj - f[,j])
# That is, the new prediction is the old prediction, plus the difference between
# the old and new partial-response function
# We could instead update by summing across each row of f (after updating
# column j), but since most columns haven't changed, that would waste time
f[,j] = new.fj
# Count how many predictions changed more than we can tolerate
number.changes = sum(abs(new.y.pred - y.pred) > tolerance)
# If that's more than zero, then we made a significant change
made.changes = (number.changes > 0)
# Replace the old predictions with the new ones
y.pred = new.y.pred
}
# Issue a warning if we left the loop without converging
if (made.changes) {
warning("Exited addm.points before backfitting converged")
}
# Return a list with the mean response, the partial response functions (f),
# the fitted values, the inputs, the actual response, the number of
# iterations, the tolerance, the maximum number of iterations allowed, and
# whether back-fitting converged.
return(list(mean.response=alpha,partial.responses=f,
fitted=alpha+apply(f,1,sum),x=x,y=y,
iterations=iterations,converged=!made.changes,tolerance=tolerance,
max.iterations=max.iterations))
}
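# Example usage (a sketch, not part of the solution proper; x.demo and y.demo
# are invented simulated data):
## x.demo = cbind(runif(300,-3,3),runif(300,-3,3))
## y.demo = sin(x.demo[,1]) + x.demo[,2]^2 + rnorm(300,sd=0.1)
## fit.demo = addm.points(y.demo,x.demo)
## plot(x.demo[,1],fit.demo$partial.responses[,1]) # should look like a centered sine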
# Compute a centered (mean-zero) smoothing
# Input: vector of input values, vector of response values
# Calls: ksmooth(), inverse.permutation()
# Output: smoothed vector of fitted response values, with sample mean of 0
centered.smoothing = function(x,y) {
g = ksmooth(x,y,n.points=length(x),x.points=x)$y
# PROBLEM: ksmooth returns its predictions with x _sorted_ in increasing
# order, not the order of the data
# SOLUTION: write a function which undoes this permutation
inv.sort.of.x = inverse.permutation(order(x))
g = g[inv.sort.of.x] # puts g back into data order
# Update the estimated function, remembering to subtract off the mean
# so that the sample average of each function stays zero
return(g - mean(g))
}
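# To see the re-ordering problem this works around (a quick check, base R only):
## ksmooth(c(3,1,2),c(30,10,20),x.points=c(3,1,2))$y
## # comes back as approximately (10, 20, 30): sorted-x order, not data order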
# Invert a permutation vector
# Input: a permutation of a vector, represented as a vector of positive integers
# with each integer from 1 to the vector's length appearing exactly once
# Presumes: input satisfies that condition
# Output: the inverse permutation
inverse.permutation = function(forward.permutation) {
# A slow way, using explicit iteration, is kept behind double-comments:
## inv.perm = vector(length=length(forward.permutation))
## for (i in 1:length(forward.permutation)) {
## inv.perm[i] = min(which(forward.permutation == i))
## }
# The "match" function returns the indices of the first occurrences of its
# first argument in its second argument.
inv.perm = match(1:length(forward.permutation),forward.permutation)
return(inv.perm)
}
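# Sanity check (base R only): sorting a vector and then applying the inverse
# of the sorting permutation should recover the original order
## v.demo = runif(5)
## all(v.demo[order(v.demo)][inverse.permutation(order(v.demo))] == v.demo) # TRUE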
### Second approach
# Fit an additive model (of type "addm"), estimating actual functions which
# can be evaluated anywhere
# Uses back-fitting to estimate partial response functions (with loess())
# Relies on R closures (returned functions keep their defining environment)
# to get the estimated functions to bind properly
# Inputs: response vector,
# matrix of input variables,
# tolerance for predictions (how close successive predictions have to be
# before we say they've converged),
# maximum number of passes over input features
# Calls: centered.smoothing.function(), make.addm()
# Output: additive model ("addm" object), including partial response functions,
# mean response, training data, fitted values, and estimation diagnostics
addm.functions = function(y,x,tolerance=sd(y)/sqrt(length(y)),
max.passes=10) {
alpha = mean(y) # Over-all mean response
p = ncol(x) # Number of inputs
n = nrow(x) # Number of data points
# Initialize the partial response functions to all be constant at zero.
# Define a constant-at-zero function (takes vector input, gives vector output)
constant.0.fn = function(x) {return(rep(0,length(x)))}
# Store all the partial response functions in a list
# The built-in "rep" command will not replicate a bare function object, but
# it will replicate a list containing one
f = rep(list(constant.0.fn),p) # p-element list, every entry the zero function
# Initialize vectors for the predicted responses; start by just predicting mean
y.pred = rep(alpha,n)
# Iterate back-fitting until predictions converge, or we max out on iterations
max.iterations = max.passes*p
made.changes = TRUE
# Tracks whether any predictions have changed; initialize to TRUE to start
# the back-fitting loop
iterations = 0 # Tracks how many back-fitting iterations we've made
j = 0 # Tracks which feature we're fitting
while(made.changes & (iterations < max.iterations)) {
iterations = iterations + 1 # Increment the iterations counter
j = ifelse(j==p,1,j+1)
# Now back-fit
input.var = x[,j]
# Calculate the partial residuals w.r.t. this variable
partial.residuals = y - (y.pred - f[[j]](input.var))
# partial residual = truth - full prediction + current function
# Update the partial response function
new.fj = centered.smoothing.function(input.var,partial.residuals)
# We only ever have one of these around, just like the partial residuals
# Update predictions, check if they've changed
new.y.pred = y.pred + (new.fj(input.var) - f[[j]](input.var))
# new prediction = old prediction + (change in re-estimated function)
# Count how many predictions changed more than the tolerance
number.changes = sum(abs(new.y.pred - y.pred) > tolerance)
# If that's more than zero, then we have not converged
made.changes = (number.changes > 0)
# Replace the old estimated functions and predictions
f[[j]] = new.fj
y.pred = new.y.pred
}
# Did we exit the loop because we converged, or because we maxed out?
if (made.changes) {
# We exited the loop before we converged
warning("Exited addm.functions before backfitting converged")
}
# Return an object type "addm" (containing the list of partial response
# functions, the mean response and the matrix of inputs in fitting), plus the
# fitted response values, actual response values, number of back-fitting
# iterations, the tolerance used in fitting, the maximum number of iterations
# allowed in fitting, and a Boolean flag for whether the back-fitting
# converged. (Most of these are just diagnostics.)
return(make.addm(partial.responses=f,mean.response=alpha,x=x,
fitted=y.pred,y=y,
iterations=iterations,tolerance=tolerance,
max.iterations=max.iterations, converged=!made.changes))
}
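# Example usage (a sketch, reusing the invented x.demo and y.demo from the
# addm.points example above):
## fit2.demo = addm.functions(y.demo,x.demo)
## fit2.demo$converged # check the back-fitting diagnostics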
# Smooth data (by loess), center the estimated function, and return the function
# Inputs: vector of input values,
# vector of responses
# Calls: loess()
# Outputs: function which takes a vector of inputs and gives a vector of predictions
centered.smoothing.function = function(x,y) {
g = loess(y ~ x, surface="direct",degree=1)
centering.constant = mean(predict(g))
# Evaluate this once, now, and insert it as a constant into the function
# definition
return(function(u) { predict(g,newdata=u) - centering.constant } )
}
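# The returned function is a closure: it carries the fitted loess object g and
# the centering constant along in its environment. A quick check (invented
# simulated data):
## x1.demo = runif(100); y1.demo = sin(x1.demo) + rnorm(100,sd=0.1)
## f.demo = centered.smoothing.function(x1.demo,y1.demo)
## mean(f.demo(x1.demo)) # zero, up to floating-point error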
# Create an object of the type "addm", representing an additive model
# Contains: a list, including the partial response functions, the over-all
# mean response, and the input values used in training. May also contain
# other list elements.
# Inputs: partial response functions (generally a list of functions),
# mean response (generally a single number),
# training inputs (generally a matrix),
# optional extra key/value pairs
# Outputs: an object of type "addm"
make.addm = function(partial.responses,mean.response,x,...) {
my.addm = list(partial.responses=partial.responses,
mean.response=mean.response,x=x,...)
class(my.addm) = "addm"
return(my.addm)
}
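# A quick check of the constructor (a sketch; the identity partial-response
# function is invented for illustration):
## m.demo = make.addm(list(function(u) {u}),mean.response=0,x=matrix(1:3,ncol=1))
## class(m.demo) # "addm", so generics like predict() dispatch to *.addm methods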
# Predict mean responses from an additive model, of type "addm"
# Inputs: additive model ("addm" object),
# matrix of input variables on which to predict (optional; defaults to
# training set of first argument)
# Presumes: additive model has all the components created in make.addm,
# the partial.responses attribute of the addm is a list of functions,
# newdata has the same number of columns as the training data
# Output: vector of numerical predictions
predict.addm = function(am,newdata=am$x) {
p = ncol(newdata) # Should test whether this is the number of input variables
# expected by am
n = nrow(newdata)
alpha = am$mean.response
f = am$partial.responses
f.matrix = matrix(0,nrow=n,ncol=p)
for (j in (1:p)) {
f.matrix[,j] = f[[j]](newdata[,j])
}
y.pred = alpha + apply(f.matrix,1,sum)
return(y.pred)
}
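# Because make.addm sets the class to "addm", the generic predict() routes
# here; a sketch, reusing fit2.demo from the addm.functions example above:
## predict(fit2.demo,newdata=cbind(0,0)) # predict at a brand-new point
## # equivalent to predict.addm(fit2.demo,newdata=cbind(0,0))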
##### Code for problem 2b
# Do one cross-validation run for picking bandwidth in ksmooth
# Inputs: vector of x values,
# vector of y values,
# proportion of data to use in training,
# vector giving grid of bandwidths
# Calls: ksmooth(), sample()
# Outputs: vector giving mean-squared error on testing set at each bandwidth
one.cv.run.ksmooth = function(x,y,p,h.grid) {
# Should test that 0 < p < 1 (p = 1 would leave an empty testing set)
n = length(x) # Should test that x and y have the same length!
n.sample = round(n*p) # Should test that this is > 0
train.rows = sample(1:n,n.sample)
train.x = x[train.rows]
train.y = y[train.rows]
test.x = x[-train.rows]
test.y = y[-train.rows]
# ksmooth orders its output by increasing x --- to avoid confusion let's
# take care of that ourselves
test.y = test.y[order(test.x)] # Sorts y values in order of increasing x
test.x = sort(test.x) # Puts x itself in order
mse = vector(length=length(h.grid))
for (i in 1:length(h.grid)) {
y.pred = ksmooth(train.x,train.y,kernel="normal",bandwidth=h.grid[i],
range.x=range(x),n.points=length(test.x),x.points=test.x)$y
mse[i] = mean((test.y - y.pred)^2)
}
return(mse)
}
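# One run on invented simulated data (a sketch; the bandwidth grid is
# arbitrary):
## x.cv.demo = runif(200,0,10); y.cv.demo = sin(x.cv.demo) + rnorm(200,sd=0.3)
## one.cv.run.ksmooth(x.cv.demo,y.cv.demo,p=0.9,h.grid=c(0.5,1,2))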
# Estimate out-of-sample error for multiple bandwidths in ksmooth, averaging
# over k independent random training/test splits
# (strictly speaking, Monte Carlo CV rather than true k-fold CV: each run
# draws its own random training set instead of rotating through fixed folds)
# Inputs: vector of x values,
# vector of y values,
# proportion of data to use in training,
# vector giving grid of bandwidths,
# number of random training/test splits (the k)
# Calls: one.cv.run.ksmooth()
# Outputs: list with best bandwidth,
# original grid of bandwidths,
# and vector of average out-of-sample error for each bandwidth
k.fold.cv.ksmooth = function(x,y,p,h.grid,k) {
k.fold.results = replicate(k,one.cv.run.ksmooth(x,y,p,h.grid))
mean.over.k.fold = apply(k.fold.results,1,mean)
return(list(best.h = h.grid[which.min(mean.over.k.fold)],h.grid=h.grid,
mean.mse.over.k.fold = mean.over.k.fold))
}
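# Picking a bandwidth by averaging over random splits (a sketch, continuing
# the invented data above):
## cv.demo = k.fold.cv.ksmooth(x.cv.demo,y.cv.demo,p=0.9,h.grid=seq(0.2,3,by=0.2),k=10)
## cv.demo$best.h
## plot(cv.demo$h.grid,cv.demo$mean.mse.over.k.fold,type="b")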