# Homework Assignment 2: The Advantages of Backwardness # 36-402, Data Analysis, Spring 2011 # Solutions by Z. Kurtz # Setup install.packages("np") require(np) data(oecdpanel) # Take a quick look at the variables names(oecdpanel) plot(oecdpanel) ############################################################### # PROBLEM 1: # Fit a linear model of growth on initgdp lm.1 = lm(growth ~ initgdp, data = oecdpanel) lm.1$coefficients ############################################################### # PROBLEM 2: # Use provided code to explore dependence of MSE on bandwidth # for a kernel regression of growth on initgdp # run the supplied code source("hw-02.R") # modify the inner loop of supplied code for the "whole data" case (non-CV) whole.MSE = rep(NA, length(bandwidths)) for (bw in bandwidths) { # Fit to the training set = whole data current.npr = npreg(growth ~ initgdp, data=oecdpanel,bws=bw) # Predict on the test set = whole data, so predictions are just fitted values predictions = fitted(current.npr) # What's the mean-squared error? whole.MSE[which(bandwidths == bw)] = mean((oecdpanel$growth - predictions)^2) # Note the alternative to the paste() trick for accessing bandwidths # Also: one part of the objets returned by npreg is an MSE attribute, so # we could have done # whole.MSE(which(bandwidths == bw)] = current.npr$MSE # but the more explicit approach will also work if we decide to start using # a different smoothing package } # display the results (bandwidths.cv.mses and whole.MSE) pdf("graphics/p2.pdf") plot(bandwidths, bandwidths.cv.mses, type = "l", lwd = 2, ylim = c(min(c(bandwidths.cv.mses, whole.MSE)), max(c(bandwidths.cv.mses, whole.MSE))), main = "MSE vs. bandwidth \n5-fold CV (black, solid) and in-sample (green, dashed)", ylab = "MSE", xlab = "Kernel bandwidth") lines(bandwidths, whole.MSE, lwd = 2, col = "green", lty=2) dev.off() ############################################################### # PROBLEM 3: pdf("graphics/p3.pdf") # Make scatterplot of initgdp versus growth: plot(oecdpanel$initgdp, oecdpanel$growth, lwd = 2, cex = 0.3, xlab = "initgdp", ylab = "growth", main = "growth vs. initgdp") # Add the line for the linear model: abline(lm.1, lwd = 2, col = "red") # Fit the kernel regression with the [approximately] optimal bandwidth 0.3: opt.reg = npreg(growth ~ initgdp, data=oecdpanel,bws=0.3) fitted.values = fitted(opt.reg) #equivelently: fitted.values = predict(opt.reg, newdata = oecdpanel) points(oecdpanel$initgdp, fitted.values, col = "green", cex = 0.5, lwd = 1.8) dev.off() ############################################################### # PROBLEM 4: # Fit a linear model of growth on initgdp, popgro, and inv lm.4 = lm(growth ~ initgdp+popgro+inv, data = oecdpanel) signif(lm.4$coefficients,3) ############################################################### # PROBLEM 5: oecd.npr <- npreg(growth ~ initgdp + popgro + inv, data=oecdpanel, tol=0.1, ftol=0.1) summary(oecd.npr) ############################################################### # PROBLEM 6: # Find and report the medians: med.popgro = median(oecdpanel$popgro) signif(med.popgro,3) med.inv = median(oecdpanel$inv) signif(med.inv,3) # Make a new data set med.oecdpanel that contains growth, initgdp, popgro, # and inv, but replace popgro and inv and with their respective medians # Use the actual values of initgdp, but sort them for easier plotting later initgdp.sort = sort(oecdpanel$initgdp) med.oecdpanel = data.frame(initgdp=initgdp.sort, popgro=med.popgro,inv=med.inv) # Find the predicted growth rates under the model from problem 4 mod4.med.preds = predict(lm.4, newdata = med.oecdpanel) # Find the predicted growth rates under the model from problem 5 mod5.med.preds = predict(oecd.npr, newdata = med.oecdpanel) # Plot growth v.s. initgdp for med.data and add the fitted values for both models: pdf("graphics/p6.pdf") # Make scatterplot of initgdp versus growth: plot(oecdpanel$initgdp, oecdpanel$growth, lwd = 2, cex = 0.3, xlab = "initgdp", ylab = "growth", main = "Data with popgro, inv set to their medians: growth vs. initgdp") lines(initgdp.sort, mod4.med.preds, col = "red", cex = 0.5, lwd = 2.8) lines(initgdp.sort, mod5.med.preds, col = "gray", cex = 0.5, lwd = 2.8) mtext("Red straight = linear model fit; Gray curve = kernel regression fit") dev.off() ############################################################### # PROBLEM 7: # Code from problem 2 with some modifications for doing CV with both the linear # model and the kernel regression (with automatic bandwidth selection) nfolds = 5 case.folds = rep(1:nfolds,length.out=nrow(oecdpanel)) case.folds = sample(case.folds) linear.fold.mses = vector(length=nfolds) kernel.fold.mses = vector(length=nfolds) for (fold in 1:nfolds) { train = oecdpanel[case.folds!=fold,] test = oecdpanel[case.folds==fold,] # Fit the models linear.model = lm(growth ~ initgdp+popgro+inv, data=train) kernel.model = npreg(growth ~ initgdp + popgro + inv, data=train, tol=0.1, ftol=0.1) # Predict on the test set linear.predictions = predict(linear.model, newdata=test) kernel.predictions = predict(kernel.model, newdata=test) linear.fold.mses[fold] = mean((test$growth - linear.predictions)^2) kernel.fold.mses[fold] = mean((test$growth - kernel.predictions)^2) } cv.linear.mod = mean(linear.fold.mses) cv.kernel.mod = mean(kernel.fold.mses)