# In-class demos for 2015-09-17, 36-401B
# Load the data set from the last homework
library(gamair)
data(chicago)
# Plot deaths each day vs. temperature
plot(death ~ tmpd, data=chicago,
xlab="Temperature (Farenheit)",
ylab="Mortality (deaths/day)")
# Fit the linear model by least squares
# Look carefully at the output in the terminal
lm(death ~ tmpd, data=chicago)
# Add it to the current plot
abline(lm(death ~ tmpd, data=chicago),lwd=4,
col="red")
# lm() actually returns a complicated object;
# store it
death.temp.lm <- lm(death ~ tmpd, data=chicago)
# see some of the stuff it contains
names(death.temp.lm)
# Get the vector of coefficients
coefficients(death.temp.lm)
# Access by position
coefficients(death.temp.lm)[1]
# Access by name
# Why doesn't this give the same number as the last command?
coefficients(death.temp.lm)["tmpd"]
# Look at the fitted values, i.e., estimates
# of conditional mean
# Use head() to see only the beginning of a vector
head(fitted(death.temp.lm))
# Plot fitted values against temperature
# Why is this _necessarily_ a straight line?
plot(chicago$tmpd, fitted(death.temp.lm),
xlab="Temperature (F)",
ylab="Prediction (deaths/day)")
# Plot actual deaths against fitted values
# Why is this _not_ always a straight line?
plot(fitted(death.temp.lm), chicago$death,
xlab="Prediction (deaths/day)",
ylab="Mortality (deaths/day)")
# Add the y=x line to this plot
# How should the point be related to this line if
# the model is predicting well?
abline(0,1,lwd=4,col="grey")
# Re-do the previous plot but for prediction errors
# rather than predictions
plot(chicago$tmpd,
chicago$death-fitted(death.temp.lm),
xlab="Temperature (F)",
ylab="Prediction error (deaths/day)")
# Add a horizontal line at zero
# Why that line?
abline(h=0,lwd=4,col="grey")
# Short-cut for difference between real and
# fitted values: residuals()
head(residuals(death.temp.lm))
# Re-do last plot
plot(chicago$tmpd, residuals(death.temp.lm),
xlab="Temperature (F)",
ylab="Prediction error (deaths/day)")
# What should that look like, according to the model?
# Or this?
plot(fitted(death.temp.lm), residuals(death.temp.lm),
xlab="Prediction (deaths/day)",
ylab="Prediction error (deaths/day)")
# Or this?
plot(chicago$time, residuals(death.temp.lm),
xlab="Mean temperature (F)",
ylab="Prediction error (deaths/day)")
# Or this?
plot(chicago$tmpd, residuals(death.temp.lm)^2,
log="y",xlab="Temperature (F)",
ylab="Squared prediction error (deaths/day)^2")
# previous with a log vertical scale for clarity
plot(chicago$tmpd, residuals(death.temp.lm)^2,
log="y",xlab="Temperature (F)",
ylab="Squared prediction error (deaths/day)^2")
# What about this?
plot(hist(residuals(death.temp.lm)[-c(5,10)],breaks=40),
freq=FALSE,
xlab="Residual (deaths/day)", main="")
curve(dnorm(x,mean=0,sd=sd(residuals(death.temp.lm))),
add=TRUE,col="blue",lwd=2)
# Make predictions at new points
predict(death.temp.lm,
newdata=data.frame(tmpd=c(60,65)))
# Note: newdata not data
# Note: needs to have a column matching the
# column name used to estimate model
# Note: other columns are OK
neo.chicago <- chicago
neo.chicago$tmpd <- neo.chicago$tmpd + 2*1.8
head(predict(death.temp.lm,
newdata=neo.chicago))
# compare
head(fitted(death.temp.lm))
# Change:
delta <- predict(death.temp.lm,newdata=neo.chicago) -
fitted(death.temp.lm)
mean(delta)*365.25 # Why?
# Alternative:
coefficients(death.temp.lm)["tmpd"]*2*1.8*365.25
# Why are these equal?