## ----echo=FALSE---------------------------------------------------------- library(knitr) opts_chunk$set(size="small",background="white", highlight=FALSE, cache=TRUE, autdep=TRUE, tidy=TRUE) ## ------------------------------------------------------------------------ # Simulate a Gaussian-noise simple linear regression model # Inputs: x sequence; intercept; slope; noise variance; switch for whether to # return the simulated values, or run a regression and return the estimated # model # Output: data frame or coefficient vector sim.gnslrm <- function(x, intercept, slope, sigma.sq, mdl=TRUE) { n <- length(x) y <- intercept + slope*x + rnorm(n,mean=0,sd=sqrt(sigma.sq)) if (mdl) { return(lm(y~x)) } else { return(data.frame(x=x, y=y)) } } # Read in a model and get it to give a prediction interval at a given x # This will be convenient when we want to have lots of models make predictions # at the same point # Inputs: the model, the value of x # Output: vector giving prediction interval extract.pred.int <- function(mdl,x,level=0.95) { predict(mdl,newdata=data.frame(x=x),interval="prediction",level=level) } # No magic numbers! x.new <- 1/137 m <- 1000 alpha <- 0.05 beta.0 <- 5 beta.1 <- -2 sigma.sq <- 0.1 ## ----ideal-prediction-intervals, echo=FALSE, out.height="0.5\\textheight"---- # Simulate Y from the model y.new <- sim.gnslrm(x=rep(x.new,m),beta.0,beta.1,sigma.sq,mdl=FALSE)$y # All the prediction intervals are the same (because x isn't changing) pred.int <- beta.0 + beta.1*x.new + sqrt(sigma.sq)*qnorm(c(alpha/2,1-alpha/2)) names(pred.int) <- c("lwr","upr") # Names make for clearer code par(mfrow=c(1,2)) # Set up for 2 side-by-side plots # Plot the first 25 runs of Y (so we can see what's happening) plot(1:25, y.new[1:25], xlab="Simulation number", ylab="Y", ylim=c(2,8)) # Add vertical segments for the prediction intervals segments(x0=1:25, x1=1:25, y0=pred.int["lwr"], y1=pred.int["upr"], lty="dashed") # For each Y, check if it's covered by the interval covered <- (y.new >= pred.int["lwr"]) & (y.new <= pred.int["upr"]) # Plot the running mean of the fraction of Y's covered by the interval plot(1:m, cumsum(covered)/(1:m), xlab="Number of simulations", ylab="Cumulative coverage proportion", ylim=c(0.5,1)) abline(h=1-alpha, col="grey") # Theoretical coverage level par(mfrow=c(1,1)) # Restore ordinary plot layout for later ## ----eval=FALSE---------------------------------------------------------- ## # Simulate Y from the model ## y.new <- sim.gnslrm(x=rep(x.new,m),beta.0,beta.1,sigma.sq,mdl=FALSE)$y ## # All the prediction intervals are the same (because x isn't changing) ## pred.int <- beta.0 + beta.1*x.new + sqrt(sigma.sq)*qnorm(c(alpha/2,1-alpha/2)) ## names(pred.int) <- c("lwr","upr") # Names make for clearer code ## par(mfrow=c(1,2)) # Set up for 2 side-by-side plots ## # Plot the first 25 runs of Y (so we can see what's happening) ## plot(1:25, y.new[1:25], xlab="Simulation number", ylab="Y", ylim=c(2,8)) ## # Add vertical segments for the prediction intervals ## segments(x0=1:25, x1=1:25, y0=pred.int["lwr"], y1=pred.int["upr"], lty="dashed") ## # For each Y, check if it's covered by the interval ## covered <- (y.new >= pred.int["lwr"]) & (y.new <= pred.int["upr"]) ## # Plot the running mean of the fraction of Y's covered by the interval ## plot(1:m, cumsum(covered)/(1:m), xlab="Number of simulations", ## ylab="Cumulative coverage proportion", ylim=c(0.5,1)) ## abline(h=1-alpha, col="grey") # Theoretical coverage level ## par(mfrow=c(1,1)) # Restore ordinary plot layout for later ## ----pred-int-with-random-coefs, echo=FALSE, out.height="0.5\\textheight"---- # Run simulations where we get a new estimate of the model on each run, # but with fixed X vector (to keep it simple) x.seq <- seq(from=-5, to=5, length.out=42) # Run the simulation many times, and give a _list_ of estimated models # simplify=FALSE forces the return value to be a list mdls <- replicate(m, sim.gnslrm(x=x.seq,beta.0,beta.1,sigma.sq,mdl=TRUE), simplify=FALSE) # Extract the prediction intervals for every one of the models pred.ints <- sapply(mdls, extract.pred.int, x=x.new) rownames(pred.ints)[2:3] <- names(pred.int) # Fix the names # Now make plots like the previous figure par(mfrow=c(1,2)) plot(1:25, y.new[1:25], xlab="Simulation number", ylab="Y", ylim=c(2,8)) segments(x0=1:25, x1=1:25, y0=pred.ints["lwr",], y1=pred.ints["upr",], lty="dashed") covered <- (y.new >= pred.ints["lwr",]) & (y.new <= pred.ints["upr",]) plot(1:m, cumsum(covered)/(1:m), xlab="Number of simulations", ylab="Cumulative coverage proportion", ylim=c(0.5,1)) abline(h=1-alpha, col="grey") par(mfrow=c(1,1)) ## ----eval=FALSE---------------------------------------------------------- ## # Run simulations where we get a new estimate of the model on each run, ## # but with fixed X vector (to keep it simple) ## x.seq <- seq(from=-5, to=5, length.out=42) ## # Run the simulation many times, and give a _list_ of estimated models ## # simplify=FALSE forces the return value to be a list ## mdls <- replicate(m, sim.gnslrm(x=x.seq,beta.0,beta.1,sigma.sq,mdl=TRUE), ## simplify=FALSE) ## # Extract the prediction intervals for every one of the models ## pred.ints <- sapply(mdls, extract.pred.int, x=x.new) ## rownames(pred.ints)[2:3] <- names(pred.int) # Fix the names ## # Now make plots like the previous figure ## par(mfrow=c(1,2)) ## plot(1:25, y.new[1:25], xlab="Simulation number", ylab="Y", ylim=c(2,8)) ## segments(x0=1:25, x1=1:25, y0=pred.ints["lwr",], y1=pred.ints["upr",], lty="dashed") ## covered <- (y.new >= pred.ints["lwr",]) & (y.new <= pred.ints["upr",]) ## plot(1:m, cumsum(covered)/(1:m), xlab="Number of simulations", ## ylab="Cumulative coverage proportion", ylim=c(0.5,1)) ## abline(h=1-alpha, col="grey") ## par(mfrow=c(1,1)) ## ----pred-intervals-with-fixed-coefs, echo=FALSE, out.height="0.5\\textheight"---- # What's the coverage if we use just one estimate of the model? # Pick the first two, arbitrarily, to show how this varies # Get the prediction interval for our x.new pred.ints <- sapply(mdls[1:2], extract.pred.int, x=x.new) rownames(pred.ints)[2:3] <- c("lwr","upr") # Make the plots par(mfrow=c(1,2)) plot(1:25, y.new[1:25], xlab="Simulation number", ylab="Y", ylim=c(2,8)) segments(x0=1:25, x1=1:25, y0=pred.ints["lwr",1], y1=pred.ints["upr",1], lty="dashed") # Slightly off-set one of the intervals for visibility segments(x0=0.2+1:25, x1=0.2+1:25, y0=pred.ints["lwr",2], y1=pred.ints["upr",2], lty="dashed", col="red") # Calculate two cumulative coverage proportions covered.1 <- (y.new >= pred.ints["lwr",1]) & (y.new <= pred.ints["upr",1]) covered.2 <- (y.new >= pred.ints["lwr",2]) & (y.new <= pred.ints["upr",2]) plot(1:m, cumsum(covered.1)/(1:m), xlab="Number of simulations", ylab="Cumulative coverage proportion", ylim=c(0.5,1)) points(1:m, cumsum(covered.2)/(1:m), col="red") abline(h=1-alpha, col="grey") par(mfrow=c(1,1)) ## ----eval=FALSE---------------------------------------------------------- ## # What's the coverage if we use just one estimate of the model? ## # Pick the first two, arbitrarily, to show how this varies ## # Get the prediction interval for our x.new ## pred.ints <- sapply(mdls[1:2], extract.pred.int, x=x.new) ## rownames(pred.ints)[2:3] <- c("lwr","upr") ## # Make the plots ## par(mfrow=c(1,2)) ## plot(1:25, y.new[1:25], xlab="Simulation number", ylab="Y", ylim=c(2,8)) ## segments(x0=1:25, x1=1:25, y0=pred.ints["lwr",1], y1=pred.ints["upr",1], lty="dashed") ## # Slightly off-set one of the intervals for visibility ## segments(x0=0.2+1:25, x1=0.2+1:25, y0=pred.ints["lwr",2], y1=pred.ints["upr",2], ## lty="dashed", col="red") ## # Calculate two cumulative coverage proportions ## covered.1 <- (y.new >= pred.ints["lwr",1]) & (y.new <= pred.ints["upr",1]) ## covered.2 <- (y.new >= pred.ints["lwr",2]) & (y.new <= pred.ints["upr",2]) ## plot(1:m, cumsum(covered.1)/(1:m), xlab="Number of simulations", ## ylab="Cumulative coverage proportion", ylim=c(0.5,1)) ## points(1:m, cumsum(covered.2)/(1:m), col="red") ## abline(h=1-alpha, col="grey") ## par(mfrow=c(1,1)) ## ----eval=FALSE---------------------------------------------------------- ## predict(object, newdata, interval=c("none", "confidence", "prediction"), level=0.95) ## ------------------------------------------------------------------------ library(gamair); data(chicago) death.temp.lm <- lm(death ~ tmpd, data=chicago) ## ----line-plus-limits, echo=FALSE---------------------------------------- plot(death ~ tmpd, data=chicago, pch=19, cex=0.5, col="grey", ylim=c(0,200), xlab="Daily mean temperature (F)", ylab="Mortality (deaths/day)") abline(death.temp.lm) temp.seq <- seq(from=-20, to=100, length.out=100) death.temp.CIs <- predict(death.temp.lm, newdata=data.frame(tmpd=temp.seq), interval="confidence") lines(temp.seq, death.temp.CIs[,"lwr"], lty="dashed", col="blue") lines(temp.seq, death.temp.CIs[,"upr"], lty="dashed", col="blue") ## ----eval=FALSE---------------------------------------------------------- ## plot(death ~ tmpd, data=chicago, pch=19, cex=0.5, col="grey", ylim=c(0,200), ## xlab="Daily mean temperature (F)", ylab="Mortality (deaths/day)") ## abline(death.temp.lm) ## temp.seq <- seq(from=-20, to=100, length.out=100) ## death.temp.CIs <- predict(death.temp.lm, newdata=data.frame(tmpd=temp.seq), ## interval="confidence") ## lines(temp.seq, death.temp.CIs[,"lwr"], lty="dashed", col="blue") ## lines(temp.seq, death.temp.CIs[,"upr"], lty="dashed", col="blue") ## ----pred-limits, echo=FALSE--------------------------------------------- plot(death ~ tmpd, data=chicago, pch=19, cex=0.5, col="grey", ylim=c(0,200), xlab="Daily mean temperature (F)", ylab="Mortality (deaths/day)") abline(death.temp.lm) temp.seq <- seq(from=-20, to=100, length.out=100) death.temp.CIs <- predict(death.temp.lm, newdata=data.frame(tmpd=temp.seq), interval="confidence") lines(temp.seq, death.temp.CIs[,"lwr"], lty="dashed", col="blue") lines(temp.seq, death.temp.CIs[,"upr"], lty="dashed", col="blue") death.temp.PIs <- predict(death.temp.lm, newdata=data.frame(tmpd=temp.seq), interval="prediction") lines(temp.seq, death.temp.PIs[,"lwr"], col="red") lines(temp.seq, death.temp.PIs[,"upr"], col="red") ## ----eval=FALSE---------------------------------------------------------- ## plot(death ~ tmpd, data=chicago, pch=19, cex=0.5, col="grey", ylim=c(0,200), ## xlab="Daily mean temperature (F)", ylab="Mortality (deaths/day)") ## abline(death.temp.lm) ## temp.seq <- seq(from=-20, to=100, length.out=100) ## death.temp.CIs <- predict(death.temp.lm, newdata=data.frame(tmpd=temp.seq), ## interval="confidence") ## lines(temp.seq, death.temp.CIs[,"lwr"], lty="dashed", col="blue") ## lines(temp.seq, death.temp.CIs[,"upr"], lty="dashed", col="blue") ## death.temp.PIs <- predict(death.temp.lm, newdata=data.frame(tmpd=temp.seq), ## interval="prediction") ## lines(temp.seq, death.temp.PIs[,"lwr"], col="red") ## lines(temp.seq, death.temp.PIs[,"upr"], col="red")