# Introductory demo of additive models
  # After an example in Simon N. Wood, _Generalized Additve Models: An
  # Introduction with R_, chapter 5.


# Load one of the two libraries for additive models
require(mgcv)
# Load package and data set
require(gamair); data(chicago)
# Peek at the data
head(chicago)
# For visualization, it will help to know what time==0 actually means
day.zero <- as.Date("1993-12-31")
# Add a column with the date
  # Why not cbind() here?
chicago$date <- chicago$time+day.zero

# Plot the data: deaths vs. time
plot(chicago$time+day.zero,chicago$death,xlab="date",ylab="deaths per day",
  pch=16,cex=0.3)
# Or:
### plot(death ~ date, data=chicago, ylab="deaths per day",pch=16,cex=0.3)

# Fit a smoothing spline
smooth.death <- smooth.spline(x=chicago$time,y=chicago$death)
# Add spline to plot
lines(chicago$date,smooth.death$y,col="red",lwd=4)
# RMSE (in sample), RMSE (cross-validated), and two versions of R^2
sqrt(mean(residuals(smooth.death)^2))   # In-sample root MSE
sqrt(smooth.death$cv.crit)              # Cross-validated root MSE
(cor(smooth.death$y,chicago$death))^2   # R^2, take 1
var(smooth.death$y)/var(chicago$death)  # R^2, take 2
plot(fitted(smooth.death),residuals(smooth.death),pch=16,cex=0.3)
abline(h=0,col="grey",lwd=3)

# Form running averages over lagging windows
  # Exercise: Re-write to eliminate the for loop
lag.mean <- function(x, window) {
  n <- length(x)
  y <- rep(0,n-window) # If we're using a loop, pre-allocate the final object
  for (t in 0:window) {
    y <- y + x[(t+1):(n-window+t)]
  }
  return(y/(window+1))
}
chicago.avg <- chicago[-(1:3),c("death","time","date")]
pollution.and.temp <- apply(chicago[,c(2,4,5,7)],2,lag.mean,window=3)
chicago.avg <- data.frame(chicago.avg,pollution.and.temp)
   # Why don't we take running averages of deaths and time?

# Fit an additive model, excluding date/time as a predictor
require(mgcv)
chicago.gam.0 <- gam(death~s(so2median)+s(pm10median)
                     +s(tmpd)+s(o3median),
  data=chicago.avg,na.action=na.exclude)
# Plot the partial response functions
plot(chicago.gam.0,select=1,
  xlab="Concentration of sulfur dioxide (deviation from baseline)",
  ylab="Predicted change in deaths per day")
plot(chicago.gam.0,select=2,
  xlab="Concentration of smaller particulates (deviation from baseline)",
  ylab="Predicted change in deaths per day")
plot(chicago.gam.0,select=3,
  xlab="Temperature (Fahrenheit)",
  ylab="Predicted change in deaths per day")
plot(chicago.gam.0,select=4,
  xlab="Concentration of ozone (deviation from baseline)",
  ylab="Predicted change in deaths per day")
# Plot over time
plot(death~date,data=chicago,ylab="deaths per day",pch=16,cex=0.3)
lines(chicago$date,smooth.death$y,col="red",lwd=8)
lines(chicago.avg$date,fitted(chicago.gam.0),col="blue",lwd=4)

plot(fitted(chicago.gam.0),residuals(chicago.gam.0),xlab="Predicted",
  ylab="Residuals",pch=16,cex=0.3)
abline(h=0,col="grey",lwd=3)


# Fit additive model, excluding date as a predictor but interacting
  # temperature and ozone
chicago.gam.1 <- gam(death~s(so2median)
                     +s(pm10median)
                     +s(tmpd,o3median),
  data=chicago.avg,na.action=na.exclude)
# Plot partial response functions
plot(chicago.gam.1,select=1,
  xlab="Concentration of sulfur dioxide [deviation from baseline]",
  ylab="Predicted change in deaths per day")
plot(chicago.gam.1,select=2,
  xlab="Concentration of particulates [deviation from baseline]",
  ylab="Predicted change in deaths per day")
plot(chicago.gam.1,select=3,xlab="Temperature (Fahrenheit)",
  ylab="Concentration of ozone [deviation from baseline]",
  se=FALSE,scheme=0,rug=FALSE)
# Plot over time
plot(death~date,data=chicago,ylab="deaths per day",pch=16,cex=0.3)
lines(chicago$date,smooth.death$y,col="red",lwd=4)
lines(chicago.avg$date,fitted(chicago.gam.0),col="blue")
lines(chicago.avg$date,fitted(chicago.gam.1),col="orange")