# Introductory demo of additive models # After an example in Simon N. Wood, _Generalized Additve Models: An # Introduction with R_, chapter 5. # Load one of the two libraries for additive models require(mgcv) # Load package and data set require(gamair); data(chicago) # Peek at the data head(chicago) # For visualization, it will help to know what time==0 actually means day.zero <- as.Date("1993-12-31") # Add a column with the date # Why not cbind() here? chicago\$date <- chicago\$time+day.zero # Plot the data: deaths vs. time plot(chicago\$time+day.zero,chicago\$death,xlab="date",ylab="deaths per day", pch=16,cex=0.3) # Or: ### plot(death ~ date, data=chicago, ylab="deaths per day",pch=16,cex=0.3) # Fit a smoothing spline smooth.death <- smooth.spline(x=chicago\$time,y=chicago\$death) # Add spline to plot lines(chicago\$date,smooth.death\$y,col="red",lwd=4) # RMSE (in sample), RMSE (cross-validated), and two versions of R^2 sqrt(mean(residuals(smooth.death)^2)) # In-sample root MSE sqrt(smooth.death\$cv.crit) # Cross-validated root MSE (cor(smooth.death\$y,chicago\$death))^2 # R^2, take 1 var(smooth.death\$y)/var(chicago\$death) # R^2, take 2 plot(fitted(smooth.death),residuals(smooth.death),pch=16,cex=0.3) abline(h=0,col="grey",lwd=3) # Form running averages over lagging windows # Exercise: Re-write to eliminate the for loop lag.mean <- function(x, window) { n <- length(x) y <- rep(0,n-window) # If we're using a loop, pre-allocate the final object for (t in 0:window) { y <- y + x[(t+1):(n-window+t)] } return(y/(window+1)) } chicago.avg <- chicago[-(1:3),c("death","time","date")] pollution.and.temp <- apply(chicago[,c(2,4,5,7)],2,lag.mean,window=3) chicago.avg <- data.frame(chicago.avg,pollution.and.temp) # Why don't we take running averages of deaths and time? # Fit an additive model, excluding date/time as a predictor require(mgcv) chicago.gam.0 <- gam(death~s(so2median)+s(pm10median) +s(tmpd)+s(o3median), data=chicago.avg,na.action=na.exclude) # Plot the partial response functions plot(chicago.gam.0,select=1, xlab="Concentration of sulfur dioxide (deviation from baseline)", ylab="Predicted change in deaths per day") plot(chicago.gam.0,select=2, xlab="Concentration of smaller particulates (deviation from baseline)", ylab="Predicted change in deaths per day") plot(chicago.gam.0,select=3, xlab="Temperature (Fahrenheit)", ylab="Predicted change in deaths per day") plot(chicago.gam.0,select=4, xlab="Concentration of ozone (deviation from baseline)", ylab="Predicted change in deaths per day") # Plot over time plot(death~date,data=chicago,ylab="deaths per day",pch=16,cex=0.3) lines(chicago\$date,smooth.death\$y,col="red",lwd=8) lines(chicago.avg\$date,fitted(chicago.gam.0),col="blue",lwd=4) plot(fitted(chicago.gam.0),residuals(chicago.gam.0),xlab="Predicted", ylab="Residuals",pch=16,cex=0.3) abline(h=0,col="grey",lwd=3) # Fit additive model, excluding date as a predictor but interacting # temperature and ozone chicago.gam.1 <- gam(death~s(so2median) +s(pm10median) +s(tmpd,o3median), data=chicago.avg,na.action=na.exclude) # Plot partial response functions plot(chicago.gam.1,select=1, xlab="Concentration of sulfur dioxide [deviation from baseline]", ylab="Predicted change in deaths per day") plot(chicago.gam.1,select=2, xlab="Concentration of particulates [deviation from baseline]", ylab="Predicted change in deaths per day") plot(chicago.gam.1,select=3,xlab="Temperature (Fahrenheit)", ylab="Concentration of ozone [deviation from baseline]", se=FALSE,scheme=0,rug=FALSE) # Plot over time plot(death~date,data=chicago,ylab="deaths per day",pch=16,cex=0.3) lines(chicago\$date,smooth.death\$y,col="red",lwd=4) lines(chicago.avg\$date,fitted(chicago.gam.0),col="blue") lines(chicago.avg\$date,fitted(chicago.gam.1),col="orange")