# In-class demos for 2015-09-22, 36-401B

# Load the data set from the last homework
library(gamair)
data(chicago)

# Plot deaths each day vs. temperature
plot(death ~ tmpd, data = chicago,
     xlab = "Temperature (Fahrenheit)",
     ylab = "Mortality (deaths/day)")

# Estimate and store a linear model of deaths on daily mean temperature
death.temp.lm <- lm(death ~ tmpd, data = chicago)

# Diagnostics

# Always plot residuals vs. predictor variable
plot(chicago$tmpd, residuals(death.temp.lm),
     xlab = "Temperature (F)",
     ylab = "Prediction error (deaths/day)")
abline(h = 0, lwd = 2, col = "grey")

# Always plot residuals^2 and/or abs(residuals) vs. predictor variable
# (checks for non-constant variance; grey line marks the overall MSE)
plot(chicago$tmpd, residuals(death.temp.lm)^2,
     xlab = "Temperature (F)",
     ylab = "Squared prediction error (deaths/day)^2")
abline(h = mean(residuals(death.temp.lm)^2), lwd = 2, col = "grey")
plot(chicago$tmpd, abs(residuals(death.temp.lm)),
     xlab = "Temperature (F)",
     ylab = "Absolute prediction error (deaths/day)")
abline(h = sqrt(mean(residuals(death.temp.lm)^2)), lwd = 2, col = "grey")

# Always plot residuals vs. time of measurement
# (chicago$time counts days; origin "1993-12-31" converts it to dates)
plot(as.Date(chicago$time, origin = "1993-12-31"),
     residuals(death.temp.lm),
     xlab = "Date", ylab = "Prediction error (deaths/day)")

# Always plot successive residuals against each other
# head() and tail() here are used to get "every day except the last"
# and "every day except the first", respectively -- see help(head)
plot(head(residuals(death.temp.lm), -1),
     tail(residuals(death.temp.lm), -1),
     xlab = "Residual today", ylab = "Residual tomorrow")

# Always plot distribution of residuals
# FIX: the original plot(hist(...), freq=FALSE, ...) drew the histogram
# twice, because hist() already plots before plot.histogram() runs; a
# single hist() call with freq = FALSE produces the intended figure once.
hist(residuals(death.temp.lm), breaks = 40, freq = FALSE,
     xlab = "Residual (deaths/day)", main = "")
# Overlay the Gaussian density with matching mean (0) and SD
curve(dnorm(x, mean = 0, sd = sd(residuals(death.temp.lm))),
      add = TRUE, col = "blue", lwd = 2)

# An alternative: plot vs. theoretical Gaussian distribution
qqnorm(residuals(death.temp.lm))
qqline(residuals(death.temp.lm))

# Always look at whether the model can extrapolate to new data
# Basic check: randomly divide into two parts, here say 90% of the data
# vs. 10%
# Use the "training set" to estimate the model
# (seq_len() replaces the 1:nrow() idiom, which misbehaves on 0 rows)
training.rows <- sample(seq_len(nrow(chicago)),
                        size = round(nrow(chicago) * 0.9),
                        replace = FALSE)
training.set <- chicago[training.rows, ]
# We'll use the "testing set" to see how well it does
testing.set <- chicago[-training.rows, ]

# Estimate the model on the training set only
training.lm <- lm(death ~ tmpd, data = training.set)

# Make predictions on the testing set.  The model didn't get to see
# these points while it was being estimated, so this really checks
# (or tests) whether it can predict.
testing.preds <- predict(training.lm, newdata = testing.set)
# Unfortunately residuals() doesn't know about the new data set,
# so calculate the residuals by hand
testing.residuals <- testing.set$death - testing.preds

# Plot our residuals against the predictor variable
plot(testing.set$tmpd, testing.residuals,
     xlab = "Daily mean temperature (F)",
     ylab = "Prediction error (deaths/day)",
     main = "Out-of-sample residuals")
# Grey: the ideal flat line at 0; red: actual mean of the new residuals
abline(h = 0, col = "grey", lwd = 2)
abline(h = mean(testing.residuals), col = "red", lwd = 2)

# Plot absolute residuals vs. predictor variable
plot(testing.set$tmpd, abs(testing.residuals),
     xlab = "Daily mean temperature (F)",
     ylab = "Absolute prediction error (deaths/day)",
     main = "Out-of-sample absolute residuals")
# Grey: in-sample RMS error; red: out-of-sample RMS error
abline(h = sqrt(mean(residuals(training.lm)^2)), col = "grey", lwd = 2)
abline(h = sqrt(mean(testing.residuals^2)), col = "red", lwd = 2)

# The model is doing alright on data it hasn't seen before,
# when the testing set is chosen at random

# From before, it looks like the really bad residuals are on
# high-temperature days.  See if the model can extrapolate from low
# temperature to high temperature --- it should be able to, if the
# simple linear model is right.

# Find the low-temperature days
lowtemp.rows <- which(chicago$tmpd < 75)  # About 90% of the data
# Divide into low- and high-temperature data sets
lowtemp.set <- chicago[lowtemp.rows, ]
hightemp.set <- chicago[-lowtemp.rows, ]

# Estimate the model on the colder days only
lowtemp.lm <- lm(death ~ tmpd, data = lowtemp.set)
# For you: how much do the parameters change, as compared to
# using all the data?

# Now predict on the high-temperature days.  Again, these are new data
# points, but now systematically different (because of their
# temperature) from the data used to estimate.
hightemp.preds <- predict(lowtemp.lm, newdata = hightemp.set)
# Calculate our own residuals (residuals() can't see the new data)
hightemp.residuals <- hightemp.set$death - hightemp.preds

# Plot residuals vs. temperature
plot(hightemp.set$tmpd, hightemp.residuals,
     xlab = "Daily mean temperature (F)",
     ylab = "Prediction error (deaths/day)",
     main = "Out-of-sample residuals")
# Flat line at 0 (ideal, if the model is right)
abline(h = 0, col = "grey", lwd = 2)
# Flat line at the mean of the new residuals
abline(h = mean(hightemp.residuals), col = "red", lwd = 2)
# Regressing the new residuals on temperature does not look good...
abline(lm(hightemp.residuals ~ hightemp.set$tmpd), col = "purple", lwd = 2)

# Similar plots for the absolute residuals
plot(hightemp.set$tmpd, abs(hightemp.residuals),
     xlab = "Daily mean temperature (F)",
     ylab = "Absolute prediction error (deaths/day)",
     main = "Out-of-sample absolute residuals")
# Grey: in-sample RMS error; red: out-of-sample RMS error;
# purple: trend of the absolute residuals in temperature
abline(h = sqrt(mean(residuals(lowtemp.lm)^2)), col = "grey", lwd = 2)
abline(h = sqrt(mean(hightemp.residuals^2)), col = "red", lwd = 2)
abline(lm(abs(hightemp.residuals) ~ hightemp.set$tmpd),
       col = "purple", lwd = 2)

# A not-very-satisfactory transformation: fold temperature around
# (roughly) room temperature, so hot and cold days both count as "far
# from room temperature"
chicago$from.room <- abs(chicago$tmpd - 72)
plot(death ~ from.room, data = chicago,
     xlab = "Departure from room temperature (F)",
     ylab = "Mortality (deaths/day)")
death.from.room <- lm(death ~ from.room, data = chicago)
abline(death.from.room, lwd = 2)

# How would we check this?  Compare residuals of the two models on the
# same axes: grey = original linear model, black = transformed model
plot(chicago$tmpd, residuals(death.temp.lm),
     xlab = "Daily mean temperature (F)",
     ylab = "Prediction error (deaths/day)",
     col = "grey")
points(chicago$tmpd, residuals(death.from.room), col = "black")
# Can now go through the rest of the diagnostic checks
# Why do I call this a not-very-satisfactory transformation?