# In-class demos for 2015-09-17, 36-401B

# Load the data set from the last homework
library(gamair)
data(chicago)

# Plot deaths each day vs. temperature
plot(death ~ tmpd, data=chicago,
     xlab="Temperature (Farenheit)",
     ylab="Mortality (deaths/day)")

# Fit the linear model by least squares
  # Look carefully at the output in the terminal
lm(death ~ tmpd, data=chicago)
# Add it to the current plot
abline(lm(death ~ tmpd, data=chicago),lwd=4,
       col="red")

# lm() actually returns a complicated object;
# store it
death.temp.lm <- lm(death ~ tmpd, data=chicago)

# see some of the stuff it contains
names(death.temp.lm)

# Get the vector of coefficients
coefficients(death.temp.lm)
# Access by position
coefficients(death.temp.lm)[1]
# Access by name
  # Why doesn't this give the same number as the last command?
coefficients(death.temp.lm)["tmpd"]

# Look at the fitted values, i.e., estimates
# of conditional mean
  # Use head() to see only the beginning of a vector
head(fitted(death.temp.lm))
# Plot fitted values against temperature
  # Why is this _necessarily_ a straight line?
plot(chicago$tmpd, fitted(death.temp.lm),
     xlab="Temperature (F)",
     ylab="Prediction (deaths/day)")
# Plot actual deaths against fitted values
  # Why is this _not_ always a straight line?
plot(fitted(death.temp.lm), chicago$death,
     xlab="Prediction (deaths/day)",
     ylab="Mortality (deaths/day)")
# Add the y=x line to this plot
  # How should the point be related to this line if
  # the model is predicting well?
abline(0,1,lwd=4,col="grey")
# Re-do the previous plot but for prediction errors
# rather than predictions
plot(chicago$tmpd,
     chicago$death-fitted(death.temp.lm),
     xlab="Temperature (F)",
     ylab="Prediction error (deaths/day)")
# Add a horizontal line at zero
  # Why that line?
abline(h=0,lwd=4,col="grey")

# Short-cut for difference between real and
# fitted values: residuals()
head(residuals(death.temp.lm))
# Re-do last plot
plot(chicago$tmpd, residuals(death.temp.lm),
     xlab="Temperature (F)",
     ylab="Prediction error (deaths/day)")
# What should that look like, according to the model?
# Or this?
plot(fitted(death.temp.lm), residuals(death.temp.lm),
     xlab="Prediction (deaths/day)",
     ylab="Prediction error (deaths/day)")
# Or this?
plot(chicago$time, residuals(death.temp.lm),
     xlab="Mean temperature (F)",
     ylab="Prediction error (deaths/day)")
# Or this?
plot(chicago$tmpd, residuals(death.temp.lm)^2,
     log="y",xlab="Temperature (F)",
     ylab="Squared prediction error (deaths/day)^2")
# previous with a log vertical scale for clarity
plot(chicago$tmpd, residuals(death.temp.lm)^2,
     log="y",xlab="Temperature (F)",
     ylab="Squared prediction error (deaths/day)^2")
# What about this?
plot(hist(residuals(death.temp.lm)[-c(5,10)],breaks=40),
     freq=FALSE,
     xlab="Residual (deaths/day)", main="")
curve(dnorm(x,mean=0,sd=sd(residuals(death.temp.lm))),
      add=TRUE,col="blue",lwd=2)

# Make predictions at new points
predict(death.temp.lm,
        newdata=data.frame(tmpd=c(60,65)))
# Note: newdata not data
# Note: needs to have a column matching the
  # column name used to estimate model
# Note: other columns are OK
neo.chicago <- chicago
neo.chicago$tmpd <- neo.chicago$tmpd + 2*1.8
head(predict(death.temp.lm,
             newdata=neo.chicago))
# compare
head(fitted(death.temp.lm))
# Change:
delta <- predict(death.temp.lm,newdata=neo.chicago) -
  fitted(death.temp.lm)
mean(delta)*365.25 # Why?
# Alternative:
coefficients(death.temp.lm)["tmpd"]*2*1.8*365.25
# Why are these equal?