# Solutions code for midterm exam 1, 36-402, spring 2011 # Read in the data gmp = read.csv(file = "gmp-2006.csv") ######## Problem 1 ############ ## Here is a function for adding spline fits to the desired plots add.spline.fit = function(x.name = "pop", y.name = "finance"){ # Remove rows of data with NA y-values relevant.data = na.omit(gmp[,c(x.name,y.name)]) # note selecting just the columns needed for the plot: # relevant.data will have two columns, and x will come first # Fit the spline (defaults to using generalized cross validation to pick # the amount of smoothing) my.spline = smooth.spline(x=relevant.data[,1],y=relevant.data[,2]) # Add to the current plot lines(my.spline, lwd = 2, col = 2) # The "$x" attribute of my.spline has the unique values of x IN ORDER # while the "$y" has the corresponding fitted values; since the lines() # function looks for either separate x and y arguments, or one argument with # components named x and y, this works. } # Open up a 4-pane plotting frame ### This is somewhat repetitive and could be automated par(mfrow = c(2,2)) plot(gmp$pop, gmp$finance, cex=0.4, lwd=1.4, pch=16, main = "Finance", xlab = "Population", ylab = "Share",log="x") add.spline.fit(y.name = "finance") plot(gmp$pop, gmp$professional.and.technical, cex=0.4, lwd=1.4, pch=16, main = "Professional and technical services", xlab = "Population", ylab = "Share",log="x") add.spline.fit(y.name = "professional.and.technical") plot(gmp$pop, gmp$ict, cex=0.4, lwd=1.4, pch=16, main = "Information technology", xlab = "Population", ylab = "Share",log="x") add.spline.fit(y.name = "ict") plot(gmp$pop, gmp$management, cex=0.4, lwd=1.4, pch=16, main = "Management", xlab = "Population", ylab = "Share",log="x") add.spline.fit(y.name = "management") ############ Problem 2 ############### par(mfrow = c(2,2)) plot(gmp$finance, gmp$pcgmp, cex=0.4, lwd=1.4, pch=16, main = "pcgmp vs finance share", xlab = "Share", ylab = "pcgmp") add.spline.fit(x.name = "finance", y.name = "pcgmp") plot(gmp$professional.and.technical, gmp$pcgmp, cex=0.4, lwd=1.4, pch=16, main = "pcgmp vs professional share", xlab = "Share", ylab = "pcgmp") add.spline.fit(x.name = "professional.and.technical", y.name = "pcgmp") plot(gmp$ict, gmp$pcgmp, cex=0.4, lwd=1.4, pch=16, main = "pcgmp vs ict share", xlab = "Share", ylab = "pcgmp") add.spline.fit(x.name = "ict", y.name = "pcgmp") plot(gmp$management, gmp$pcgmp, cex=0.4, lwd=1.4, pch=16, main = "pcgmp vs management share", xlab = "Share", ylab = "pcgmp") add.spline.fit(x.name = "management", y.name = "pcgmp") ######## Problem 3 ############### # Fit the log-additive model library(mgcv) gam.3 = gam(log(pcgmp) ~ log(pop) + s(finance) + s(professional.and.technical) + s(ict) + s(management), data = gmp) # Look at it to get the intercept and the parametric coefficient summary(gam.3) # Residual standard error: sd(gam.3$residuals) # Equivalently sqrt(mean(residuals(gam.3)^2)) # Plot the four smoothed partial response functions par(mfrow = c(2,2)) plot(gam.3,seWithMean=TRUE) # Alternately par(mfrow=c(1,1)) # Reset to a single plot window plot(gam.3,pages=1,seWithMean=TRUE) # Examine the residuals par(mfrow = c(2,1)) # qqplot for the distribution of the residuals qqnorm(residuals(gam.3)) qqline(residuals(gam.3)) # Any obvious trends vs. the fitted values? plot(fitted(gam.3), residuals(gam.3), main = "Residuals vs. fitted values", xlab = "Fitted values", ylab = "Residuals") rug(fitted(gam.3)) abline(h= 0) ############### Question 4 ############### # How much does increasing population by 10% change things in the power law # model? # Fit the power law: lm.p.law = lm(log(pcgmp) ~ log(pop), data = gmp) # Get the coefficient b: coefficients(lm.p.law)[2] # = 0.124 # What are the prediction for current populations? old.p.law.preds <- predict(lm.p.law,newdata=gmp) # Predict on a new data set with larger population new.p.law.preds <- predict(lm.p.law,newdata=data.frame(pop=1.1*gmp$pop)) # Alternatively, create a whole new data frame with all the columns, but # just change population gmp.plus.10 <- gmp gmp.plus.10$pop <- 1.1*gmp.plus.10$pop new.p.law.preds <- predict(lm.p.law,newdata=gmp.plus.10) # Average change mean(new.p.law.preds - old.p.law.preds) # Alternative to directly doing predictions which uses the model # specification (see solutions for why this works): log(1.1) * coefficients(lm.p.law)[2] # Same answer # Repeat this with the additive model old.lam.preds <- predict(gam.3) # Gives fitted values without newdata new.lam.preds <- predict(gam.3,newdata=na.omit(gmp.plus.10)) # Mean change: mean(new.lam.preds - old.lam.preds) # Same trick works again: log(1.1)*coefficients(gam.3)["log(pop)"] ######## Problem 5 ############## # In-sample MSEs for the two models mean(residuals(gam.3)^2) mean(residuals(lm.p.law)^2) # Compare them by cross-validation, with bootstrapping from the power law model # to assess significance # Create a clean (no-NA) version of the data gmp.clean = na.omit(gmp) # CV comparison of MSEs do.nfold.cv = function(nfolds = 5, data=gmp.clean){ # make test vs train labels case.folds = rep(1:nfolds,length.out=nrow(data)) case.folds = sample(case.folds) fold.mses.gam = rep(0,nfolds) fold.mses.lm = fold.mses.gam for (fold in 1:nfolds) { # Split data into training and test data train = data[case.folds!=fold,] test = data[case.folds==fold,] # Fit the gam model: my.gam = gam(log(pcgmp) ~ log(pop) + s(finance) + s(professional.and.technical) + s(ict) + s(management), data = train) # Fit the old model: my.lm = lm(log(pcgmp) ~ log(pop), data = train) # Predict on the test set predictions.gam = predict(my.gam, newdata=test) predictions.lm = predict(my.lm, newdata=test) # We want to compare predictions from both models against the truth truth = log(test$pcgmp) # gam MSE: fold.mses.gam[fold] = mean((truth - predictions.gam)^2) # power-law MSE: fold.mses.lm[fold] = mean((truth - predictions.lm)^2) } # Report an estimate of how much the power-law model MSE exceeds # the power-law MSE: return(mean(fold.mses.gam) - mean(fold.mses.lm)) } # Run on the data observed.cv <- do.nfold.cv(data=gmp.clean) observed.cv # Simulate from the power law model, using resampling of residuals simulate.power.law <- function(data=gmp.clean,model=lm.p.law) { n <- nrow(data) fitted.pcgmp <- predict(model,newdata=data) noise <- sample(residuals(model),size=n,replace=TRUE) # Or use the resample() command defined in the lecture notes data$pcgmp <- exp(fitted.pcgmp + noise) return(data) } # Create the bootstrap replicates (note: 1000 replicates takes 8.5 minutes on # my laptop, use a smaller value for debugging) bootstrap.cv.mses <- replicate(100,do.nfold.cv(data=simulate.power.law())) # p-value (1+sum(bootstrap.cv.mses < observed.cv))/(1+length(bootstrap.cv.mses)) # Sanity check that: summary(bootstrap.cv.mses)