# Compare predictive ability of different bandwidths using k-fold CV
# Inputs: number of folds, vector of bandwidths, dataframe
# Presumes: data frame contains variables called "gdp.growth" and "gdp"
# Output: vector of cross-validated MSEs for the different bandwidths
# The default bandwidths here are NOT good ones for other problems
cv.growth.folds <- function(nfolds=5, bandwidths=c(0.05,(1:5)/10), df=penn) {
require(np)
case.folds <- rep(1:nfolds,length.out=nrow(df))
# divide the cases as evenly as possible
case.folds <- sample(case.folds) # randomly permute the order
fold.mses <- matrix(0,nrow=nfolds,ncol=length(bandwidths))
colnames(fold.mses) = as.character(bandwidths)
# By naming the columns, we'll won't have to keep track of which bandwidth
# is in which position
for (fold in 1:nfolds) {
# What are the training cases and what are the test cases?
train <- df[case.folds!=fold,]
test <- df[case.folds==fold,]
for (bw in bandwidths) {
# Fit to the training set
# First create a "bandwidth object" with the fixed bandwidth
current.npr.bw <- npregbw(gdp.growth ~ log(gdp), data=train, bws=bw,
bandwidth.compute=FALSE)
# Now actually use it to create the kernel regression
current.npr <- npreg(bws=current.npr.bw)
# Predict on the test set
predictions <- predict(current.npr, newdata=test)
# What's the mean-squared error?
fold.mses[fold,paste(bw)] <- mean((test$gdp.growth - predictions)^2)
# Using paste() here lets us access the column with the right name...
}
}
# Average the MSEs
bandwidths.cv.mses <- colMeans(fold.mses)
return(bandwidths.cv.mses)
}