# In-class demos for 2015-09-17, 36-401B # Load the data set from the last homework library(gamair) data(chicago) # Plot deaths each day vs. temperature plot(death ~ tmpd, data=chicago, xlab="Temperature (Farenheit)", ylab="Mortality (deaths/day)") # Fit the linear model by least squares # Look carefully at the output in the terminal lm(death ~ tmpd, data=chicago) # Add it to the current plot abline(lm(death ~ tmpd, data=chicago),lwd=4, col="red") # lm() actually returns a complicated object; # store it death.temp.lm <- lm(death ~ tmpd, data=chicago) # see some of the stuff it contains names(death.temp.lm) # Get the vector of coefficients coefficients(death.temp.lm) # Access by position coefficients(death.temp.lm)[1] # Access by name # Why doesn't this give the same number as the last command? coefficients(death.temp.lm)["tmpd"] # Look at the fitted values, i.e., estimates # of conditional mean # Use head() to see only the beginning of a vector head(fitted(death.temp.lm)) # Plot fitted values against temperature # Why is this _necessarily_ a straight line? plot(chicago$tmpd, fitted(death.temp.lm), xlab="Temperature (F)", ylab="Prediction (deaths/day)") # Plot actual deaths against fitted values # Why is this _not_ always a straight line? plot(fitted(death.temp.lm), chicago$death, xlab="Prediction (deaths/day)", ylab="Mortality (deaths/day)") # Add the y=x line to this plot # How should the point be related to this line if # the model is predicting well? abline(0,1,lwd=4,col="grey") # Re-do the previous plot but for prediction errors # rather than predictions plot(chicago$tmpd, chicago$death-fitted(death.temp.lm), xlab="Temperature (F)", ylab="Prediction error (deaths/day)") # Add a horizontal line at zero # Why that line? abline(h=0,lwd=4,col="grey") # Short-cut for difference between real and # fitted values: residuals() head(residuals(death.temp.lm)) # Re-do last plot plot(chicago$tmpd, residuals(death.temp.lm), xlab="Temperature (F)", ylab="Prediction error (deaths/day)") # What should that look like, according to the model? # Or this? plot(fitted(death.temp.lm), residuals(death.temp.lm), xlab="Prediction (deaths/day)", ylab="Prediction error (deaths/day)") # Or this? plot(chicago$time, residuals(death.temp.lm), xlab="Mean temperature (F)", ylab="Prediction error (deaths/day)") # Or this? plot(chicago$tmpd, residuals(death.temp.lm)^2, log="y",xlab="Temperature (F)", ylab="Squared prediction error (deaths/day)^2") # previous with a log vertical scale for clarity plot(chicago$tmpd, residuals(death.temp.lm)^2, log="y",xlab="Temperature (F)", ylab="Squared prediction error (deaths/day)^2") # What about this? plot(hist(residuals(death.temp.lm)[-c(5,10)],breaks=40), freq=FALSE, xlab="Residual (deaths/day)", main="") curve(dnorm(x,mean=0,sd=sd(residuals(death.temp.lm))), add=TRUE,col="blue",lwd=2) # Make predictions at new points predict(death.temp.lm, newdata=data.frame(tmpd=c(60,65))) # Note: newdata not data # Note: needs to have a column matching the # column name used to estimate model # Note: other columns are OK neo.chicago <- chicago neo.chicago$tmpd <- neo.chicago$tmpd + 2*1.8 head(predict(death.temp.lm, newdata=neo.chicago)) # compare head(fitted(death.temp.lm)) # Change: delta <- predict(death.temp.lm,newdata=neo.chicago) - fitted(death.temp.lm) mean(delta)*365.25 # Why? # Alternative: coefficients(death.temp.lm)["tmpd"]*2*1.8*365.25 # Why are these equal?