## SOLUTION CODE BY ZACH spring 2011, 402 Shalizi #### Problem 1 #### Problem 2 # Get the data: gmp.data = read.csv(file = "http://www.stat.cmu.edu/~cshalizi/402/hw/06/gmp_2006.csv") pcgmp.data = read.csv(file = "http://www.stat.cmu.edu/~cshalizi/402/hw/06/pcgmp_2006.csv") # Using summary(gmp.data) helps to show that X2006 is the column of interest # Check that both data sets are ordered the same way: sum(gmp.data$Metropolitan != pcgmp.data$Metropolitan) # The N, in millions: Divide gdp by gdp-per-capita N = gmp.data$X2006/pcgmp.data$X2006 # The N, corrected (since gdp is in millions) N = N*1000000 # Observe that the result does not match what Cosma says it should: summary(N) # Compute variance of log y y = pcgmp.data$X2006 # same as Y/N log.y = log(y) var(log.y) #### Problem 3 # Compute log population: log.N = log(N) # Fit a linear model lm.3 = lm(log.y ~ log.N) # The output of summary(lm.3) includes the following # Compute MSE: mean(lm.3$residuals^2) #### Problem 4 pdf("graphics/p4.pdf") plot(N, y, main = "The fitted power-law curve", cex = 0.4, lwd = 2, pch = 16) # The fitted values are the exponentials of the log fit: y.fit = exp(fitted(lm.3)) fitted.points = cbind(N,y.fit) fitted.points = fitted.points[order(N),] lines(fitted.points, lwd = 2, col = 2) dev.off() #### Problem 5 # For the nonparametric fit we will use the function smooth.spline, # letting R use a default process to choose the bandwidth (or # number of knots). We could choose to do spline.mod = smooth.spline(log.y~log.N) spline.fit = predict(spline.mod, log.N) # MSE: mean((spline.fit$y - log.y)^2) # comparison: plotting may help dim(reg.fit$eval) plot(N, y) fitted.points = cbind(exp(spline.fit$x),exp(spline.fit$y)) fitted.points = fitted.points[order(fitted.points[,1]),] lines(fitted.points, lwd = 2, col = 2) #### Problem 6 # We have completed steps 1-3 in the method outlined # in lecture 10 section 1 # Step 4: How much bigger is the parametric MSE from the nonparametric MSE? t.hat = mean(lm.3$residuals^2) - mean((spline.fit$y - log.y)^2) # For steps 5-9, we need some functions: # make.fake (below) is a function that simulates # faked data based on the parametric model lm.3 # The new x's are the same as the old x's; the new y's are the lm.3 # fitted values plus Gaussian noise with variance approximated by the # in-sample MSE make.fake = function(){ fake.log.y = fitted(lm.3) + + rnorm(length(log.y), mean = 0, sd = sqrt(mean(lm.3$residuals^2))) return(fake.log.y) } # This function estimates the parametric model on the simulated data: sim.parametric = function(fake.log.y){ lm.on.fake = lm(fake.log.y ~ log.N) MSE = mean(lm.on.fake$residuals^2) return(MSE) } # This function estimates the nonparametric model on the simulated data: sim.non.parametric = function(fake.log.y){ spline.mod = smooth.spline(fake.log.y~log.N) spline.fit = predict(spline.mod, log.N) MSE = mean((spline.fit$y - fake.log.y)^2) return(MSE) } # Use the functions above to simulate t.hat as T.hat sim.T.hat = function(){ fake.log.y = make.fake() MSE.parametric = sim.parametric(fake.log.y) MSE.non.parametric = sim.non.parametric(fake.log.y) T.hat = MSE.parametric - MSE.non.parametric return(T.hat) } # Produce 1000 T.hat replicates T.hat.replicates = replicate(1000, sim.T.hat()) # The p-value (1+sum(T.hat.replicates > t.hat))/(1+length(T.hat.replicates))