# Named functions defined for Lecture 20 (smoothing and parametric # specification testing), 36-350, Fall 2009 # Multi-fold cross-validation for univariate kernel regression # Inputs: vector of input feature values; vector of response values; # vector of bandwidths; fraction of data for each training set; # number of folds (folds) # Calls: npreg from the np package # Output: list with components of: best bandwidth, vector of # cross-validated MSEs, array of MSE for each bandwidth on each fold cv_bws_npreg <- function(x,y,bandwidths=(1:50)/50,training.fraction=0.9, folds=10) { require(np) n = length(x) # Sanity-check inputs stopifnot(n> 1, length(y) == n) stopifnot(length(bandwidths) > 1) # CV pointless otherwise stopifnot(training.fraction > 0, training.fraction < 1) stopifnot(folds > 0, folds==trunc(folds)) # Make a matrix to store MSEs for each fold/bandwidth combination fold_MSEs = matrix(0,nrow=folds,ncol=length(bandwidths)) # Name the columns after bandwidths for easy reference later # coerces the numerical bandwidths into character strings colnames(fold_MSEs) = bandwidths n.train = max(1,floor(training.fraction*n)) for (i in 1:folds) { train.rows = sample(1:n,size=n.train,replace=FALSE) x.train = x[train.rows] y.train = y[train.rows] x.test = x[-train.rows] y.test = y[-train.rows] for (bw in bandwidths) { fold_MSEs[i,paste(bw)] <- npreg(txdat=x.train,tydat=y.train,exdat=x.test, eydat=y.test,bws=bw)$MSE # see help(npreg) for arguments to that function # paste(bw): turns numerical bandwidth to type character, so R knows it's # the name of a column, not a column index } } CV_MSEs = colMeans(fold_MSEs) best.bw = bandwidths[which.min(CV_MSEs)] return(list(best.bw=best.bw,CV_MSEs=CV_MSEs,fold_MSEs=fold_MSEs)) } # Line-drawing method for an npregression object # Inputs: npregression object (npr) # index of input dimension to plot (xdim) # optional plotting arguments # Presumes: npr has attributes promised by the npregression class # Outputs: permutation of input values, invisibly # Side-effects: adds lines to current plot lines.npregression <- function(npr,xdim=1,...) { # Ensure we're not called on something of inappropriate class stopifnot("npregression" %in% class(npr)) # Ensure we only plot one dimension, which npr has # TODO: check that this dimension is numerical! stopifnot(length(xdim)==1, xdim %in% (1:npr$ndim)) # Get the x and y values x = npr$eval[,xdim] y = npr$mean # Re-arrange in order of increasing x # TODO: what about ties? May give ugly/weird results if they # exist and there were multiple input dimensions x.ord = order(x) lines(x[x.ord],y[x.ord],...) # Passes job to default method invisible(x.ord) } # Pick npreg bandwidth by k-fold CV and then use it # Inputs: vector of input feature values (x) # vector of response values (y) # vector of bandwidths (bandwidths) # fraction of data for each training set (training.fraction) # number of folds (folds) # optional extra arguments to npreg (...) # Calls: cv_bws_npreg, npreg # Ouputs: fitted npregression object cv_npreg <- function(x,y,bandwidths=(1:50)/50, training.fraction=0.9,folds=10,...) { cv <- cv_bws_npreg(x,y,bandwidths,training.fraction,folds) fit <- npreg(bws=cv$best.bw,txdat=x,tydat=y,...) return(fit) } # One parametric bootstrap value of the test statistic # Inputs: fitted linear model, x values for the test # Calls: cv_npreg # Returns: scalar difference in MSEs calc.T = function(linfit,test.x=x) { y.sim = simulate(linfit)[,1] # Returned as a matrix MSE.p = mean((lm(y.sim~test.x)$residuals)^2) MSE.np = cv_npreg(test.x,y.sim)$MSE return(MSE.p - MSE.np) }