## ----echo=FALSE----------------------------------------------------------
library(knitr)
opts_chunk$set(size="small",background="white", highlight=FALSE, cache=TRUE, autdep=TRUE, tidy=TRUE)

## ------------------------------------------------------------------------
# Simulate a Gaussian-noise simple linear regression model
# Inputs: x sequence; intercept; slope; noise variance; switch for whether to
  # return the simulated values, or run a regression and return the estimated
  # model
# Output: data frame or coefficient vector
sim.gnslrm <- function(x, intercept, slope, sigma.sq, mdl=TRUE) {
  n <- length(x)
  y <- intercept + slope*x + rnorm(n,mean=0,sd=sqrt(sigma.sq))
  if (mdl) {
    return(lm(y~x))
  } else {
    return(data.frame(x=x, y=y))
  }
}

# Read in a model and get it to give a prediction interval at a given x
  # This will be convenient when we want to have lots of models make predictions
  # at the same point
# Inputs: the model, the value of x
# Output: vector giving prediction interval
extract.pred.int <- function(mdl,x,level=0.95) {
    predict(mdl,newdata=data.frame(x=x),interval="prediction",level=level)
}


# No magic numbers!
x.new <- 1/137
m <- 1000
alpha <- 0.05
beta.0 <- 5
beta.1 <- -2
sigma.sq <- 0.1

## ----ideal-prediction-intervals, echo=FALSE, out.height="0.5\\textheight"----
# Simulate Y from the model
y.new <- sim.gnslrm(x=rep(x.new,m),beta.0,beta.1,sigma.sq,mdl=FALSE)$y
# All the prediction intervals are the same (because x isn't changing)
pred.int <- beta.0 + beta.1*x.new + sqrt(sigma.sq)*qnorm(c(alpha/2,1-alpha/2))
names(pred.int) <- c("lwr","upr") # Names make for clearer code
par(mfrow=c(1,2)) # Set up for 2 side-by-side plots
# Plot the first 25 runs of Y (so we can see what's happening)
plot(1:25, y.new[1:25], xlab="Simulation number", ylab="Y", ylim=c(2,8))
# Add vertical segments for the prediction intervals
segments(x0=1:25, x1=1:25, y0=pred.int["lwr"], y1=pred.int["upr"], lty="dashed")
# For each Y, check if it's covered by the interval
covered <- (y.new >= pred.int["lwr"]) & (y.new <= pred.int["upr"])
# Plot the running mean of the fraction of Y's covered by the interval
plot(1:m, cumsum(covered)/(1:m), xlab="Number of simulations",
     ylab="Cumulative coverage proportion", ylim=c(0.5,1))
abline(h=1-alpha, col="grey") # Theoretical coverage level
par(mfrow=c(1,1)) # Restore ordinary plot layout for later

## ----eval=FALSE----------------------------------------------------------
## # Simulate Y from the model
## y.new <- sim.gnslrm(x=rep(x.new,m),beta.0,beta.1,sigma.sq,mdl=FALSE)$y
## # All the prediction intervals are the same (because x isn't changing)
## pred.int <- beta.0 + beta.1*x.new + sqrt(sigma.sq)*qnorm(c(alpha/2,1-alpha/2))
## names(pred.int) <- c("lwr","upr") # Names make for clearer code
## par(mfrow=c(1,2)) # Set up for 2 side-by-side plots
## # Plot the first 25 runs of Y (so we can see what's happening)
## plot(1:25, y.new[1:25], xlab="Simulation number", ylab="Y", ylim=c(2,8))
## # Add vertical segments for the prediction intervals
## segments(x0=1:25, x1=1:25, y0=pred.int["lwr"], y1=pred.int["upr"], lty="dashed")
## # For each Y, check if it's covered by the interval
## covered <- (y.new >= pred.int["lwr"]) & (y.new <= pred.int["upr"])
## # Plot the running mean of the fraction of Y's covered by the interval
## plot(1:m, cumsum(covered)/(1:m), xlab="Number of simulations",
##      ylab="Cumulative coverage proportion", ylim=c(0.5,1))
## abline(h=1-alpha, col="grey") # Theoretical coverage level
## par(mfrow=c(1,1)) # Restore ordinary plot layout for later

## ----pred-int-with-random-coefs, echo=FALSE, out.height="0.5\\textheight"----
# Run simulations where we get a new estimate of the model on each run,
# but with fixed X vector (to keep it simple)
x.seq <- seq(from=-5, to=5, length.out=42)
# Run the simulation many times, and give a _list_ of estimated models
  # simplify=FALSE forces the return value to be a list
mdls <- replicate(m,  sim.gnslrm(x=x.seq,beta.0,beta.1,sigma.sq,mdl=TRUE),
                  simplify=FALSE)
# Extract the prediction intervals for every one of the models
pred.ints <- sapply(mdls, extract.pred.int, x=x.new)
rownames(pred.ints)[2:3] <- names(pred.int) # Fix the names
# Now make plots like the previous figure
par(mfrow=c(1,2))
plot(1:25, y.new[1:25], xlab="Simulation number", ylab="Y", ylim=c(2,8))
segments(x0=1:25, x1=1:25, y0=pred.ints["lwr",], y1=pred.ints["upr",], lty="dashed")
covered <- (y.new >= pred.ints["lwr",]) & (y.new <= pred.ints["upr",])
plot(1:m, cumsum(covered)/(1:m), xlab="Number of simulations",
     ylab="Cumulative coverage proportion", ylim=c(0.5,1))
abline(h=1-alpha, col="grey")
par(mfrow=c(1,1))

## ----eval=FALSE----------------------------------------------------------
## # Run simulations where we get a new estimate of the model on each run,
## # but with fixed X vector (to keep it simple)
## x.seq <- seq(from=-5, to=5, length.out=42)
## # Run the simulation many times, and give a _list_ of estimated models
##   # simplify=FALSE forces the return value to be a list
## mdls <- replicate(m,  sim.gnslrm(x=x.seq,beta.0,beta.1,sigma.sq,mdl=TRUE),
##                   simplify=FALSE)
## # Extract the prediction intervals for every one of the models
## pred.ints <- sapply(mdls, extract.pred.int, x=x.new)
## rownames(pred.ints)[2:3] <- names(pred.int) # Fix the names
## # Now make plots like the previous figure
## par(mfrow=c(1,2))
## plot(1:25, y.new[1:25], xlab="Simulation number", ylab="Y", ylim=c(2,8))
## segments(x0=1:25, x1=1:25, y0=pred.ints["lwr",], y1=pred.ints["upr",], lty="dashed")
## covered <- (y.new >= pred.ints["lwr",]) & (y.new <= pred.ints["upr",])
## plot(1:m, cumsum(covered)/(1:m), xlab="Number of simulations",
##      ylab="Cumulative coverage proportion", ylim=c(0.5,1))
## abline(h=1-alpha, col="grey")
## par(mfrow=c(1,1))

## ----pred-intervals-with-fixed-coefs, echo=FALSE, out.height="0.5\\textheight"----
# What's the coverage if we use just one estimate of the model?
  # Pick the first two, arbitrarily, to show how this varies
# Get the prediction interval for our x.new
pred.ints <- sapply(mdls[1:2], extract.pred.int, x=x.new)
rownames(pred.ints)[2:3] <- c("lwr","upr")
# Make the plots
par(mfrow=c(1,2))
plot(1:25, y.new[1:25], xlab="Simulation number", ylab="Y", ylim=c(2,8))
segments(x0=1:25, x1=1:25, y0=pred.ints["lwr",1], y1=pred.ints["upr",1], lty="dashed")
# Slightly off-set one of the intervals for visibility
segments(x0=0.2+1:25, x1=0.2+1:25, y0=pred.ints["lwr",2], y1=pred.ints["upr",2],
         lty="dashed", col="red")
# Calculate two cumulative coverage proportions
covered.1 <- (y.new >= pred.ints["lwr",1]) & (y.new <= pred.ints["upr",1])
covered.2 <- (y.new >= pred.ints["lwr",2]) & (y.new <= pred.ints["upr",2])
plot(1:m, cumsum(covered.1)/(1:m), xlab="Number of simulations",
     ylab="Cumulative coverage proportion", ylim=c(0.5,1))
points(1:m, cumsum(covered.2)/(1:m), col="red")
abline(h=1-alpha, col="grey")
par(mfrow=c(1,1))

## ----eval=FALSE----------------------------------------------------------
## # What's the coverage if we use just one estimate of the model?
##   # Pick the first two, arbitrarily, to show how this varies
## # Get the prediction interval for our x.new
## pred.ints <- sapply(mdls[1:2], extract.pred.int, x=x.new)
## rownames(pred.ints)[2:3] <- c("lwr","upr")
## # Make the plots
## par(mfrow=c(1,2))
## plot(1:25, y.new[1:25], xlab="Simulation number", ylab="Y", ylim=c(2,8))
## segments(x0=1:25, x1=1:25, y0=pred.ints["lwr",1], y1=pred.ints["upr",1], lty="dashed")
## # Slightly off-set one of the intervals for visibility
## segments(x0=0.2+1:25, x1=0.2+1:25, y0=pred.ints["lwr",2], y1=pred.ints["upr",2],
##          lty="dashed", col="red")
## # Calculate two cumulative coverage proportions
## covered.1 <- (y.new >= pred.ints["lwr",1]) & (y.new <= pred.ints["upr",1])
## covered.2 <- (y.new >= pred.ints["lwr",2]) & (y.new <= pred.ints["upr",2])
## plot(1:m, cumsum(covered.1)/(1:m), xlab="Number of simulations",
##      ylab="Cumulative coverage proportion", ylim=c(0.5,1))
## points(1:m, cumsum(covered.2)/(1:m), col="red")
## abline(h=1-alpha, col="grey")
## par(mfrow=c(1,1))

## ----eval=FALSE----------------------------------------------------------
## predict(object, newdata, interval=c("none", "confidence", "prediction"), level=0.95)

## ------------------------------------------------------------------------
library(gamair); data(chicago)
death.temp.lm <- lm(death ~ tmpd, data=chicago)

## ----line-plus-limits, echo=FALSE----------------------------------------
plot(death ~ tmpd, data=chicago, pch=19, cex=0.5, col="grey", ylim=c(0,200),
     xlab="Daily mean temperature (F)", ylab="Mortality (deaths/day)")
abline(death.temp.lm)
temp.seq <- seq(from=-20, to=100, length.out=100)
death.temp.CIs <- predict(death.temp.lm, newdata=data.frame(tmpd=temp.seq),
                            interval="confidence")
lines(temp.seq, death.temp.CIs[,"lwr"], lty="dashed", col="blue")
lines(temp.seq, death.temp.CIs[,"upr"], lty="dashed", col="blue")

## ----eval=FALSE----------------------------------------------------------
## plot(death ~ tmpd, data=chicago, pch=19, cex=0.5, col="grey", ylim=c(0,200),
##      xlab="Daily mean temperature (F)", ylab="Mortality (deaths/day)")
## abline(death.temp.lm)
## temp.seq <- seq(from=-20, to=100, length.out=100)
## death.temp.CIs <- predict(death.temp.lm, newdata=data.frame(tmpd=temp.seq),
##                             interval="confidence")
## lines(temp.seq, death.temp.CIs[,"lwr"], lty="dashed", col="blue")
## lines(temp.seq, death.temp.CIs[,"upr"], lty="dashed", col="blue")

## ----pred-limits, echo=FALSE---------------------------------------------
plot(death ~ tmpd, data=chicago, pch=19, cex=0.5, col="grey", ylim=c(0,200),
     xlab="Daily mean temperature (F)", ylab="Mortality (deaths/day)")
abline(death.temp.lm)
temp.seq <- seq(from=-20, to=100, length.out=100)
death.temp.CIs <- predict(death.temp.lm, newdata=data.frame(tmpd=temp.seq),
                            interval="confidence")
lines(temp.seq, death.temp.CIs[,"lwr"], lty="dashed", col="blue")
lines(temp.seq, death.temp.CIs[,"upr"], lty="dashed", col="blue")
death.temp.PIs <- predict(death.temp.lm, newdata=data.frame(tmpd=temp.seq),
                          interval="prediction")
lines(temp.seq, death.temp.PIs[,"lwr"], col="red")
lines(temp.seq, death.temp.PIs[,"upr"], col="red")

## ----eval=FALSE----------------------------------------------------------
## plot(death ~ tmpd, data=chicago, pch=19, cex=0.5, col="grey", ylim=c(0,200),
##      xlab="Daily mean temperature (F)", ylab="Mortality (deaths/day)")
## abline(death.temp.lm)
## temp.seq <- seq(from=-20, to=100, length.out=100)
## death.temp.CIs <- predict(death.temp.lm, newdata=data.frame(tmpd=temp.seq),
##                             interval="confidence")
## lines(temp.seq, death.temp.CIs[,"lwr"], lty="dashed", col="blue")
## lines(temp.seq, death.temp.CIs[,"upr"], lty="dashed", col="blue")
## death.temp.PIs <- predict(death.temp.lm, newdata=data.frame(tmpd=temp.seq),
##                           interval="prediction")
## lines(temp.seq, death.temp.PIs[,"lwr"], col="red")
## lines(temp.seq, death.temp.PIs[,"upr"], col="red")