# Homework Assignment 3: Old Heteroskedastic # 36-402, Data Analysis, Spring 2011 # Solutions by Z. Kurtz # Setup setwd("C:/Documents and Settings/zach/My Documents/My Dropbox/402") library(MASS) data(geyser) summary(geyser) ############################################################ # Problem 1 #pdf("hw3/graphics/p1.pdf") # Plot the data points plot(geyser$duration, geyser$waiting, cex = 0.5, pch = 16, main = "Waiting time as function of geyser duration", xlab = "Duration (minutes)", ylab = "Waiting time (minutes)") mtext("Black line = LS regression line") # Build linear model: lm.1 = lm(waiting ~ duration, data = geyser) # Add the regression line to the plot abline(lm.1) #dev.off() ############################################################ # Problem 2 #pdf("hw3/graphics/p2and3.pdf") # Plot the squared residuals against duration plot(geyser$duration, lm.1$residuals^2, cex = 0.5, pch = 16, main = "Squared residuals versus geyser duration", xlab = "Geyser duration", ylab = "Squared residuals from linear model") ############################################################ # Problem 3 # To estimate the variance, I will use the same kernel regression function # from HW Assignment 2: require(np) reg.var3 = npreg(lm.1$residuals^2 ~ geyser$duration) # Pair each observation, indexed by duration, with its fitted value fits = cbind(geyser$duration, reg.var3$mean) # Order the observation-fit pairs by duration (column 1) fits3 = fits[order(fits[,1]),] # Add the fits to the plot lines(fits3, col=2, lwd = 2.5) mtext("Red curve: kernel regression function", col = 2) #dev.off() ############################################################ # Problem 4 # Build a weighted linear model: lm.4 = lm(waiting ~ duration, data = geyser, weights = 1/reg.var3$mean) # Consider the coefficients and standard errors summary(lm.1)$coefficients summary(lm.4)$coefficients ############################################################ # Problem 5 #pdf("hw3/graphics/p5.pdf") # Do a nonparametric kernel regression of waiting on duration regression5 = npreg(waiting ~ duration, data = geyser, residuals = T) # Plot the data and then plot the regression results plot(geyser$duration, geyser$waiting, cex = 0.5, pch = 16, main = "Waiting time as function of geyser duration with a kernel regression curve", xlab = "Duration (minutes)", ylab = "Waiting time (minutes)") # Pair each observation, indexed by duration, with its fitted value fits = cbind(geyser$duration, regression5$mean) # Order the observation-fit pairs by duration (column 1) fits5 = fits[order(fits[,1]),] # Add the fits to the plot lines(fits, col=2, lwd = 2.5) #dev.off() ############################################################ # Problem 6 # Do a kernel regression of the squared residuals (from part 5) on duration reg.var6 = npreg(regression5$resid^2 ~ geyser$duration) # Pair each observation, indexed by duration, with its fitted value fits = cbind(geyser$duration, reg.var6$mean) # Order the observation-fit pairs by duration (column 1) fits6 = fits[order(fits[,1]),] #pdf("hw3/graphics/p6.pdf") # Plot the fits as a blue dashed curve plot(fits6, type = "l", col = 4, lty = 2, lwd = 2.5, xlab = "Duration (minutes)", ylab = "Estimated variance of waiting given duration", main = "Estimated variance of waiting given duration: Solid black is for unweighted linear regression, and dashed blue is for kernel regression" ) # Add to the plot the fits from part 3 lines(fits3, lwd = 2) #dev.off() ############################################################ # Problem 7 require(np) #pdf("hw3/graphics/p7.pdf") # Produce a bandwidth specification bw = npcdensbw(waiting~duration, data = geyser, tol=.1, ftol=.1) # Produce a conditional density estimate and plot it npc.dens7 = npcdens(bws = bw) npplot(bws=bw, theta = -30, phi = 30, view = "fixed", main = "Conditional density estimate for waiting given duration") #dev.off()