##### Functions of problem 1b

### First approach

# Fit an additive model (without estimating functions)
# Inputs: response vector,
#   matrix of input features,
#   tolerance level (amount by which predictions can change before we
#     care), defaults to the standard error of the mean response,
#   maximum number of passes over variables for backfitting, default 10
# Presumes: y and x have the same number of rows, no missing/NA values
# Calls: centered.smoothing()
# Outputs: list, with elements:
#   over-all mean response,
#   matrix of partial response functions evaluated at the data points,
#   fitted response value at each data point,
#   training inputs,
#   training response values,
#   number of iterations of backfitting,
#   convergence flag,
#   tolerance level,
#   maximum number of iterations allowed
addm.points = function(y,x,tolerance=sd(y)/sqrt(length(y)),max.passes=10) {
  alpha = mean(y)
  p = ncol(x)
  n = nrow(x)
  f = matrix(0,nrow=n,ncol=p)
  # f is the values of the functions at the training data points. One column
  # per function. All functions initialized to zero.
  y.pred = rep(alpha,n) # Initially, we always predict the mean
  # Iterate back-fitting until we no longer really change the predictions, or
  # until we max out on iterations
  max.iterations = max.passes*p
  made.changes = TRUE
  iterations = 0
  j = 0
  while(made.changes & (iterations < max.iterations)) {
    # Increment the count of the number of iterations
    iterations = iterations + 1
    # Increment the variable we're updating; keep it from getting out of bounds
    j = ifelse(j == p,1,j+1)
    # Calculate the partial residuals for this feature.
    # (Only use one set of partial residuals at a time, so a vector works)
    partial.residuals = y - (y.pred - f[,j])
    # partial residual = truth - (prediction - this function)
    # Smooth the partial residuals
    new.fj = centered.smoothing(x[,j],partial.residuals)
    # Update predictions, see if they've changed
    new.y.pred = y.pred + (new.fj - f[,j])
    # That is, the new prediction is the old prediction, plus the difference
    # between the old and new partial-response function
    # Could update by summing up each row in f (first updating column j), but
    # most columns aren't changing, so this wastes a lot of time
    f[,j] = new.fj
    # Count how many predictions changed more than we can tolerate
    number.changes = sum(abs(new.y.pred - y.pred) > tolerance)
    # If that's more than zero, then we made a significant change
    made.changes = (number.changes > 0)
    # Replace the old predictions with the new ones
    y.pred = new.y.pred
  }
  # Issue a warning if we left the loop without converging
  if (made.changes) {
    warning("Exited addm.points before backfitting converged")
  }
  # Return a list with the over-all mean response, the partial response
  # functions (f), the fitted values, the inputs, the actual response, the
  # number of iterations, the tolerance, the maximum number of iterations
  # allowed, and whether back-fitting converged.
  return(list(mean.response=alpha,partial.responses=f,
              fitted=alpha+apply(f,1,sum),x=x,y=y,
              iterations=iterations,converged=!made.changes,tolerance=tolerance,
              max.iterations=max.iterations))
}

# Compute a centered (mean-zero) smoothing
# Input: vector of input values, vector of response values
# Calls: ksmooth(), inverse.permutation()
# Output: smoothed vector of fitted response values, with sample mean of 0
centered.smoothing = function(x,y) {
  g = ksmooth(x,y,n.points=length(x),x.points=x)$y
  # PROBLEM: ksmooth returns its predictions with x _sorted_ in increasing
  # order, not the order of the data
  # SOLUTION: write a function which undoes this permutation
  inv.sort.of.x = inverse.permutation(order(x))
  g = g[inv.sort.of.x] # puts g back into data order
  # Update the estimated function, remembering to subtract off the mean
  # so that the sample average of each function stays zero
  return(g - mean(g))
}

# Invert a permutation vector
# Input: a permutation of a vector, represented as a vector of positive integers
#   with each integer from 1 to the vector's length appearing exactly once
# Presumes: input satisfies that condition
# Output: the inverse permutation
inverse.permutation = function(forward.permutation) {
  # Slow way with explicit iteration behind double-comments
  ## inv.perm = vector(length=length(forward.permutation))
  ## for (i in 1:length(forward.permutation)) {
  ##   inv.perm[i] = min(which(forward.permutation == i))
  ## }
  # The "match" function returns the indices of the first occurrences of its
  # first argument in its second argument.
  inv.perm = match(1:length(forward.permutation),forward.permutation)
  return(inv.perm)
}

####### Second approach

# Fit an additive model (of type "addm"), estimating actual functions which
# can be evaluated anywhere
# Uses back-fitting to estimate partial response functions (with loess())
# Some R trickery to get functions to bind properly
# Inputs: response vector,
#   matrix of input variables,
#   tolerance for predictions (how close successive predictions have to be
#     before we say they've converged),
#   maximum number of passes over input features
# Calls: make.addm(), predict.addm(), loess()
# Output: additive model ("addm" object), including partial response functions,
#   mean response, training data, fitted values, and estimation diagnostics
addm.functions = function(y,x,tolerance=sd(y)/sqrt(length(y)),
                          max.passes=10) {
  alpha = mean(y) # Over-all mean response
  p = ncol(x)     # Number of inputs
  n = nrow(x)     # Number of data points
  # Initialize the partial response functions to all be constant at zero.
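  # (Each partial response is stored as an R closure, so it can be evaluated
  # at arbitrary new inputs, not just the training points; a re-estimated
  # function carries its own loess fit and centering constant with it through
  # lexical scoping. See centered.smoothing.function() below.)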
  # Define a constant-at-zero function (takes vector input, gives vector output)
  constant.0.fn = function(x) {return(rep(0,length(x)))}
  # Store all the partial response functions in a list
  # The built-in "rep" command does not work with function objects, and I don't
  # feel like fixing this
  f = list(constant.0.fn)                          # Creates a one-element list
  for (j in (2:p)) { f[[j]] = constant.0.fn }      # Expands the list as needed
  # Initialize vector of predicted responses; start by just predicting the mean
  y.pred = rep(alpha,n)
  # Iterate back-fitting until predictions converge, or we max out on iterations
  max.iterations = max.passes*p
  made.changes = TRUE
  # Tracks whether any predictions have changed; initialize to TRUE to start
  # the back-fitting loop
  iterations = 0 # Tracks how many back-fitting iterations we've made
  j = 0          # Tracks which feature we're fitting
  while(made.changes & (iterations < max.iterations)) {
    iterations = iterations + 1 # Increment the iterations counter
    j = ifelse(j==p,1,j+1)
    # Now back-fit
    input.var = x[,j]
    # Calculate the partial residuals w.r.t. this variable
    partial.residuals = y - (y.pred - f[[j]](input.var))
    # partial residual = truth - full prediction + current function
    # Update the partial response function
    new.fj = centered.smoothing.function(input.var,partial.residuals)
    # We only ever have one of these around, just like the partial residuals
    # Update predictions, check if they've changed
    new.y.pred = y.pred + (new.fj(input.var) - f[[j]](input.var))
    # new prediction = old prediction + (change in re-estimated function)
    # Count how many predictions changed more than the tolerance
    number.changes = sum(abs(new.y.pred - y.pred) > tolerance)
    # If that's more than zero, then we have not converged
    made.changes = (number.changes > 0)
    # Replace the old estimated function and predictions
    f[[j]] = new.fj
    y.pred = new.y.pred
  }
  # Did we exit the loop because we converged, or because we maxed out?
  if (made.changes) {
    # We exited the loop before we converged
    warning("Exited addm.functions before backfitting converged")
  }
  # Return an object of type "addm" (containing the list of partial response
  # functions, the mean response, and the matrix of inputs used in fitting),
  # plus the fitted response values, actual response values, number of
  # back-fitting iterations, the tolerance used in fitting, the maximum number
  # of iterations allowed in fitting, and a Boolean flag for whether the
  # back-fitting converged. (Most of these are just diagnostics.)
  return(make.addm(partial.responses=f,mean.response=alpha,x=x,
                   fitted=y.pred,y=y,
                   iterations=iterations,tolerance=tolerance,
                   max.iterations=max.iterations,
                   converged=!made.changes))
}

# Smooth data (by loess), center the estimated function, and return the function
# Inputs: vector of input values,
#   vector of responses
# Calls: loess()
# Outputs: function which takes a vector of inputs and gives a vector of
#   predictions
centered.smoothing.function = function(x,y) {
  g = loess(y ~ x, surface="direct",degree=1)
  centering.constant = mean(predict(g))
  # Evaluate this once, now, and insert it as a constant into the function
  # definition
  return(function(u) { predict(g,newdata=u) - centering.constant } )
}

# Create an object of the type "addm", representing an additive model
# Contains: a list, including the partial response functions, the over-all
#   mean response, and the input values used in training. May also contain
#   other list elements.
# Inputs: partial response functions (generally a list of functions),
#   mean response (generally a single number),
#   training inputs (generally a matrix),
#   optional extra key/value pairs
# Outputs: an object of type "addm"
make.addm = function(partial.responses,mean.response,x,...) {
  my.addm = list(partial.responses=partial.responses,
                 mean.response=mean.response,x=x,...)
  class(my.addm) = "addm"
  return(my.addm)
}

# Predict mean responses from an additive model, of type "addm"
# Inputs: additive model ("addm" object),
#   matrix of input variables on which to predict (optional; defaults to
#     training set of first argument)
# Presumes: additive model has all the components created in make.addm,
#   the partial.responses attribute of the addm is a list of functions,
#   newdata has same number of columns as training data
# Output: vector of numerical predictions
predict.addm = function(am,newdata=am$x) {
  p = ncol(newdata) # Should test whether this is the number of input
                    # variables expected by am
  n = nrow(newdata)
  alpha = am$mean.response
  f = am$partial.responses
  f.matrix = matrix(0,nrow=n,ncol=p)
  for (j in (1:p)) {
    f.matrix[,j] = f[[j]](newdata[,j])
  }
  y.pred = alpha + apply(f.matrix,1,sum)
  return(y.pred)
}

############# Code for problem 2b

# Do one cross-validation run for picking bandwidth in ksmooth
# Inputs: vector of x values,
#   vector of y values,
#   proportion of data to use in training,
#   vector giving grid of bandwidths
# Calls: ksmooth(), sample()
# Outputs: vector giving mean-squared error on testing set at each bandwidth
one.cv.run.ksmooth = function(x,y,p,h.grid) {
  # Should test that 0 < p < 1, so that both sets are non-empty
  n = length(x)         # Should test that x and y have the same length!
  n.sample = round(n*p) # Should test that this is > 0
  train.rows = sample(1:n,n.sample)
  train.x = x[train.rows]
  train.y = y[train.rows]
  test.x = x[-train.rows]
  test.y = y[-train.rows]
  # ksmooth orders its output by increasing x --- to avoid confusion let's
  # take care of that ourselves
  test.y = test.y[order(test.x)] # Sorts y values in order of increasing x
  test.x = sort(test.x)          # Puts x itself in order
  mse = vector(length=length(h.grid))
  for (i in 1:length(h.grid)) {
    y.pred = ksmooth(train.x,train.y,kernel="normal",bandwidth=h.grid[i],
                     range.x=range(x),n.points=length(test.x),
                     x.points=test.x)$y
    mse[i] = mean((test.y - y.pred)^2)
  }
  return(mse)
}

# Use k-fold CV (implemented here as k independent random training/testing
# splits) to get out-of-sample error for multiple bandwidths in ksmooth
# Inputs: vector of x values,
#   vector of y values,
#   proportion of data to use in training,
#   vector of grid of bandwidths,
#   number of folds of CV
# Calls: one.cv.run.ksmooth()
# Outputs: list with best bandwidth,
#   original grid of bandwidths,
#   and vector of average out-of-sample error for each bandwidth
k.fold.cv.ksmooth = function(x,y,p,h.grid,k) {
  k.fold.results = replicate(k,one.cv.run.ksmooth(x,y,p,h.grid))
  mean.over.k.fold = apply(k.fold.results,1,mean)
  return(list(best.h = h.grid[which.min(mean.over.k.fold)],h.grid=h.grid,
              mean.mse.over.k.fold = mean.over.k.fold))
}
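
############# Usage sketch (illustrative only)

# A minimal sketch of how the functions above fit together, run on simulated
# data. Everything below --- the data-generating process, the object names
# (demo.x, demo.y, new.x, etc.), and the parameter choices --- is an
# assumption made for illustration, not part of the assigned problems.
set.seed(42) # Arbitrary seed, for reproducibility
demo.n = 200
demo.x = matrix(runif(2*demo.n,min=-3,max=3),ncol=2)
demo.y = sin(demo.x[,1]) + demo.x[,2]^2/4 + rnorm(demo.n,sd=0.1)
# First approach: partial response values at the training points only
fit.points = addm.points(demo.y,demo.x)
# Second approach: actual functions, usable at new input values
fit.functions = addm.functions(demo.y,demo.x)
new.x = matrix(runif(20,min=-3,max=3),ncol=2)
predict(fit.functions,newdata=new.x) # Dispatches to predict.addm()
# Problem 2b helper: pick a bandwidth for ksmooth() by cross-validation
k.fold.cv.ksmooth(demo.x[,1],demo.y,p=0.9,h.grid=seq(0.2,2,by=0.2),k=5)$best.h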