## ----echo=FALSE----------------------------------------------------------
library(knitr)
opts_chunk$set(size="small",background="white", highlight=FALSE, cache=TRUE, autdep=TRUE, tidy=TRUE)

## ----initial-data-plot, echo=FALSE---------------------------------------
# Load the data set from the last homework
library(gamair)
data(chicago)

# Plot deaths each day vs. temperature
plot(death ~ tmpd, data=chicago,
     xlab="Temperature (Farenheit)",
     ylab="Mortality (deaths/day)")

# Estimate and store a linear model
death.temp.lm <- lm(death ~ tmpd, data=chicago)
abline(death.temp.lm, col="blue")

## ----eval=FALSE----------------------------------------------------------
## # Load the data set from the last homework
## library(gamair)
## data(chicago)
## 
## # Plot deaths each day vs. temperature
## plot(death ~ tmpd, data=chicago,
##      xlab="Temperature (Farenheit)",
##      ylab="Mortality (deaths/day)")
## 
## # Estimate and store a linear model
## death.temp.lm <- lm(death ~ tmpd, data=chicago)
## abline(death.temp.lm, col="blue")

## ----residuals-vs-predictor, echo=FALSE----------------------------------
# Always plot residuals vs. predictor variable
plot(chicago$tmpd, residuals(death.temp.lm),
     xlab="Temperature (F)",
     ylab="Prediction error (deaths/day)")
abline(h=0, col="grey")

## ----eval=FALSE----------------------------------------------------------
## # Always plot residuals vs. predictor variable
## plot(chicago$tmpd, residuals(death.temp.lm),
##      xlab="Temperature (F)",
##      ylab="Prediction error (deaths/day)")
## abline(h=0, col="grey")

## ----residuals-sq-vs-temperature, echo=FALSE-----------------------------
# Always plot residuals^2 vs.
# predictor variable
plot(chicago$tmpd, residuals(death.temp.lm)^2,
     xlab="Temperature (F)",
     ylab="Squared prediction error (deaths/day)^2")
abline(h=mean(residuals(death.temp.lm)^2),
       lwd=2, col="grey")

## ----eval=FALSE----------------------------------------------------------
## # Always plot residuals^2 vs.
## # predictor variable
## plot(chicago$tmpd, residuals(death.temp.lm)^2,
##      xlab="Temperature (F)",
##      ylab="Squared prediction error (deaths/day)^2")
## abline(h=mean(residuals(death.temp.lm)^2),
##        lwd=2, col="grey")

## ----abs-residuals-vs-temperature, echo=FALSE----------------------------
plot(chicago$tmpd, abs(residuals(death.temp.lm)),
     xlab="Temperature (F)",
     ylab="Absolute prediction error (deaths/day)")
abline(h=sqrt(mean(residuals(death.temp.lm)^2)),
       lwd=2, col="grey")

## ----eval=FALSE----------------------------------------------------------
## plot(chicago$tmpd, abs(residuals(death.temp.lm)),
##      xlab="Temperature (F)",
##      ylab="Absolute prediction error (deaths/day)")
## abline(h=sqrt(mean(residuals(death.temp.lm)^2)),
##        lwd=2, col="grey")

## ----residuals-vs-date, echo=FALSE---------------------------------------
# Always plot residuals vs. coordinate
plot(as.Date(chicago$time,origin="1993-12-31"),
     residuals(death.temp.lm), xlab="Date",
     ylab="Prediction error (deaths/day)")

## ----eval=FALSE----------------------------------------------------------
## # Always plot residuals vs. coordinate
## plot(as.Date(chicago$time,origin="1993-12-31"),
##      residuals(death.temp.lm), xlab="Date",
##      ylab="Prediction error (deaths/day)")

## ----eval=FALSE----------------------------------------------------------
## sample(residuals(my.model))

## ----residual-vs-residual, echo=FALSE------------------------------------
# Always plot successive residuals against each other
  # head() and tail() here are used to get "every day except the last"
  # and "every day except the first", respectively
  # see help(head)
plot(head(residuals(death.temp.lm),-1),
     tail(residuals(death.temp.lm),-1),
     xlab="Residual today",
     ylab="Residual tomorrow")
abline(lm(tail(residuals(death.temp.lm),-1) ~ head(residuals(death.temp.lm),-1)))

## ----eval=FALSE----------------------------------------------------------
## # Always plot successive residuals against each other
##   # head() and tail() here are used to get "every day except the last"
##   # and "every day except the first", respectively
##   # see help(head)
## plot(head(residuals(death.temp.lm),-1),
##      tail(residuals(death.temp.lm),-1),
##      xlab="Residual today",
##      ylab="Residual tomorrow")
## abline(lm(tail(residuals(death.temp.lm),-1) ~ head(residuals(death.temp.lm),-1)))

## ----histogram-of-residuals, echo=FALSE----------------------------------
# Always plot distribution of residuals
hist(residuals(death.temp.lm),breaks=40,
     freq=FALSE,
     xlab="Residual (deaths/day)", main="")
curve(dnorm(x,mean=0,sd=sd(residuals(death.temp.lm))),
      add=TRUE,col="blue")

## ----eval=FALSE----------------------------------------------------------
## # Always plot distribution of residuals
## hist(residuals(death.temp.lm),breaks=40,
##      freq=FALSE,
##      xlab="Residual (deaths/day)", main="")
## curve(dnorm(x,mean=0,sd=sd(residuals(death.temp.lm))),
##       add=TRUE,col="blue")

## ----qqplot-of-residuals, echo=FALSE-------------------------------------
# An alternative: plot vs. theoretical Gaussian distribution
qqnorm(residuals(death.temp.lm))
qqline(residuals(death.temp.lm))

## ----eval=FALSE----------------------------------------------------------
## # An alternative: plot vs. theoretical Gaussian distribution
## qqnorm(residuals(death.temp.lm))
## qqline(residuals(death.temp.lm))

## ----set-up-cv-----------------------------------------------------------
# Always look at whether the model can extrapolate to
# new data
# Basic check: randomly divide into two parts, here say 90% of the data
# vs. 10%
# Use the "training set" to estimate the model
training.rows <- sample(1:nrow(chicago), size=round(nrow(chicago)*0.9),
                        replace=FALSE)
training.set <- chicago[training.rows,]
# We'll use the "testing set" to see how well it does
testing.set <- chicago[-training.rows,]
# Estimate the model on the training set only
training.lm <- lm(death ~ tmpd, data=training.set)
# Make predictions on the testing set
  # The model didn't get to see these points while it was being
  # estimated, so this really checks (or tests) whether it can
  # predict
testing.preds <- predict(training.lm, newdata=testing.set)
# Unfortunately residuals() doesn't know about the new data set
# so calculate the residuals by hand
testing.residuals <- testing.set$death-testing.preds

## ----cv-residuals-plot,echo=FALSE----------------------------------------
# Plot our residuals against the predictor variable
plot(testing.set$tmpd, testing.residuals,
     xlab="Daily mean temperature (F)",
     ylab="Prediction error (deaths/day)",
     main="Out-of-sample residuals")
abline(h=0,col="grey")
abline(h=mean(testing.residuals),col="red")

## ----eval=FALSE----------------------------------------------------------
## # Plot our residuals against the predictor variable
## plot(testing.set$tmpd, testing.residuals,
##      xlab="Daily mean temperature (F)",
##      ylab="Prediction error (deaths/day)",
##      main="Out-of-sample residuals")
## abline(h=0,col="grey")
## abline(h=mean(testing.residuals),col="red")

## ----cv-residuals-sq-plot, echo=FALSE------------------------------------
# Plot absolute residuals vs. predictor variable
plot(testing.set$tmpd, abs(testing.residuals),
     xlab="Daily mean temperature (F)",
     ylab="Absolute prediction error (deaths/day)",
     main="Out-of-sample absolute residuals")
abline(h=sqrt(mean(residuals(training.lm)^2)),col="grey")
abline(h=sqrt(mean(testing.residuals^2)),col="red")

## ----eval=FALSE----------------------------------------------------------
## # Plot absolute residuals vs. predictor variable
## plot(testing.set$tmpd, abs(testing.residuals),
##      xlab="Daily mean temperature (F)",
##      ylab="Absolute prediction error (deaths/day)",
##      main="Out-of-sample absolute residuals")
## abline(h=sqrt(mean(residuals(training.lm)^2)),col="grey")
## abline(h=sqrt(mean(testing.residuals^2)),col="red")

## ----cv-by-temp----------------------------------------------------------
# Find the low-temperature days
lowtemp.rows <- which(chicago$tmpd < 75)  # About 90% of the data
# Divide into low- and high- temperature data sets
lowtemp.set <- chicago[lowtemp.rows,]
hightemp.set <- chicago[-lowtemp.rows,]
# Estimate the model on the colder days only
lowtemp.lm <- lm(death ~ tmpd, data=lowtemp.set)
   # For you: how much do the parameters change, as compared to
   # using all the data?
# Now predict on the high-temperature days
  # Again, these are new data points, but now systematically
  # different (because of their temperature) from the
  # data used to estimate
hightemp.preds <- predict(lowtemp.lm, newdata=hightemp.set)
# Calculate our own residuals
hightemp.residuals <- hightemp.set$death-hightemp.preds

## ----cv-by-temp-plot, echo=FALSE-----------------------------------------
# Plot residuals vs. temperature
plot(hightemp.set$tmpd, hightemp.residuals,
     xlab="Daily mean temperature (F)",
     ylab="Prediction error (deaths/day)",
     main="Out-of-sample residuals")
# Flat line at 0 (ideal, if the model is right)
abline(h=0,col="grey")
# Flat line at the mean of the new residuals
abline(h=mean(hightemp.residuals),col="red")
# Regressing the new residuals on temperature does not look good...
abline(lm(hightemp.residuals ~ hightemp.set$tmpd),col="purple")

## ----eval=FALSE----------------------------------------------------------
## # Plot residuals vs. temperature
## plot(hightemp.set$tmpd, hightemp.residuals,
##      xlab="Daily mean temperature (F)",
##      ylab="Prediction error (deaths/day)",
##      main="Out-of-sample residuals")
## # Flat line at 0 (ideal, if the model is right)
## abline(h=0,col="grey")
## # Flat line at the mean of the new residuals
## abline(h=mean(hightemp.residuals),col="red")
## # Regressing the new residuals on temperature does not look good...
## abline(lm(hightemp.residuals ~ hightemp.set$tmpd),col="purple")

## ----abs-cv-by-temp-plot, echo=FALSE-------------------------------------
# Similar plots for the absolute residuals
plot(hightemp.set$tmpd, abs(hightemp.residuals),
     xlab="Daily mean temperature (F)",
     ylab="Absolute prediction error (deaths/day)",
     main="Out-of-sample absolute residuals")
abline(h=sqrt(mean(residuals(lowtemp.lm)^2)),col="grey")
abline(h=sqrt(mean(hightemp.residuals^2)),col="red")
abline(lm(abs(hightemp.residuals) ~ hightemp.set$tmpd),col="purple")

## ----eval=FALSE----------------------------------------------------------
## # Similar plots for the absolute residuals
## plot(hightemp.set$tmpd, abs(hightemp.residuals),
##      xlab="Daily mean temperature (F)",
##      ylab="Absolute prediction error (deaths/day)",
##      main="Out-of-sample absolute residuals")
## abline(h=sqrt(mean(residuals(lowtemp.lm)^2)),col="grey")
## abline(h=sqrt(mean(hightemp.residuals^2)),col="red")
## abline(lm(abs(hightemp.residuals) ~ hightemp.set$tmpd),col="purple")

## ----eval=FALSE----------------------------------------------------------
## smooth.spline(x,y,cv=TRUE)

## ----smooth.spline-demo, echo=FALSE, warning=FALSE-----------------------
plot(death ~ tmpd, data=chicago,
     xlab="Temperature (Farenheit)",
     ylab="Mortality (deaths/day)")
death.temp.ss <- smooth.spline(x=chicago$tmpd, y=chicago$death, cv=TRUE)
abline(death.temp.lm, col="blue")
lines(death.temp.ss, col="red")

## ----eval=FALSE----------------------------------------------------------
## plot(death ~ tmpd, data=chicago,
##      xlab="Temperature (Farenheit)",
##      ylab="Mortality (deaths/day)")
## death.temp.ss <- smooth.spline(x=chicago$tmpd, y=chicago$death, cv=TRUE)
## abline(death.temp.lm, col="blue")
## lines(death.temp.ss, col="red")

## ----smooth.spline-residuals, echo=FALSE, warning=FALSE------------------
plot(chicago$tmpd, residuals(death.temp.lm),
     xlab="Temperature (F)",
     ylab="Prediction error (deaths/day)")
abline(h=0, col="grey")
lines(smooth.spline(x=chicago$tmpd, y=residuals(death.temp.lm), cv=TRUE),
      col="red")

## ----eval=FALSE----------------------------------------------------------
## plot(chicago$tmpd, residuals(death.temp.lm),
##      xlab="Temperature (F)",
##      ylab="Prediction error (deaths/day)")
## abline(h=0, col="grey")
## lines(smooth.spline(x=chicago$tmpd, y=residuals(death.temp.lm), cv=TRUE),
##       col="red")

## ----smooth.spline-on-sq-residuals, echo=FALSE, warning=FALSE------------
plot(chicago$tmpd, residuals(death.temp.lm)^2, log="y",
     xlab="Temperature (F)",
     ylab="Squared prediction error (deaths/day)^2")
abline(h=mean(residuals(death.temp.lm)^2),
       lwd=2, col="grey")
lines(smooth.spline(x=chicago$tmpd, y=residuals(death.temp.lm)^2,
                    cv=TRUE),
      col="red")

## ----eval=FALSE----------------------------------------------------------
## plot(chicago$tmpd, residuals(death.temp.lm)^2, log="y",
##      xlab="Temperature (F)",
##      ylab="Squared prediction error (deaths/day)^2")
## abline(h=mean(residuals(death.temp.lm)^2),
##        lwd=2, col="grey")
## lines(smooth.spline(x=chicago$tmpd, y=residuals(death.temp.lm)^2,
##                     cv=TRUE),
##       col="red")

## ----boxcox-of-established-model, echo=FALSE-----------------------------
library(MASS)
# Works with a previously-fit lm model
boxcox(death.temp.lm)

## ----eval=FALSE----------------------------------------------------------
## library(MASS)
## # Works with a previously-fit lm model
## boxcox(death.temp.lm)

## ----box-cox-on-new-model------------------------------------------------
# You can also give a model formula, and suppress plotting
bc.death.temp <- boxcox(death ~ tmpd, data=chicago, plotit=FALSE)
# Result is a list showing lambda values ($x component) vs. log-likelihood
# (the $y component)
# We can use this to get a (rough) maximum value for lambda
(lambda.hat <- bc.death.temp$x[which.max(bc.death.temp$y)])

## ----bc-transf-lm, echo=FALSE--------------------------------------------
chicago$bc.death <- (chicago$death^lambda.hat-1)/lambda.hat
bc.lm <- lm(bc.death ~ tmpd, data=chicago)
plot(death ~ tmpd, data=chicago,
     xlab="Temperature (Farenheit)",
     ylab="Mortality (deaths/day)")
points(chicago[,"tmpd"], (lambda.hat*fitted(bc.lm)+1)^(1/lambda.hat), pch=19,
       col="blue")

## ----eval=FALSE----------------------------------------------------------
## chicago$bc.death <- (chicago$death^lambda.hat-1)/lambda.hat
## bc.lm <- lm(bc.death ~ tmpd, data=chicago)
## plot(death ~ tmpd, data=chicago,
##      xlab="Temperature (Farenheit)",
##      ylab="Mortality (deaths/day)")
## points(chicago[,"tmpd"], (lambda.hat*fitted(bc.lm)+1)^(1/lambda.hat), pch=19,
##        col="blue")