# Compare predictive ability of different bandwidths using k-fold CV # Inputs: number of folds, vector of bandwidths, dataframe # Presumes: data frame contains variables called "gdp.growth" and "gdp" # Output: vector of cross-validated MSEs for the different bandwidths # The default bandwidths here are NOT good ones for other problems cv.growth.folds <- function(nfolds=5, bandwidths=c(0.05,(1:5)/10), df=penn) { require(np) case.folds <- rep(1:nfolds,length.out=nrow(df)) # divide the cases as evenly as possible case.folds <- sample(case.folds) # randomly permute the order fold.mses <- matrix(0,nrow=nfolds,ncol=length(bandwidths)) colnames(fold.mses) = as.character(bandwidths) # By naming the columns, we'll won't have to keep track of which bandwidth # is in which position for (fold in 1:nfolds) { # What are the training cases and what are the test cases? train <- df[case.folds!=fold,] test <- df[case.folds==fold,] for (bw in bandwidths) { # Fit to the training set # First create a "bandwidth object" with the fixed bandwidth current.npr.bw <- npregbw(gdp.growth ~ log(gdp), data=train, bws=bw, bandwidth.compute=FALSE) # Now actually use it to create the kernel regression current.npr <- npreg(bws=current.npr.bw) # Predict on the test set predictions <- predict(current.npr, newdata=test) # What's the mean-squared error? fold.mses[fold,paste(bw)] <- mean((test$gdp.growth - predictions)^2) # Using paste() here lets us access the column with the right name... } } # Average the MSEs bandwidths.cv.mses <- colMeans(fold.mses) return(bandwidths.cv.mses) }