# Examples from class, 14 February 2012
# Load required packages and the data set
library(mgcv)
library(gamair)
data(chicago)
# Examine the data set:
colnames(chicago)
summary(chicago)
# "pm25median" is full of NAs, not collected until the latter part of the
# data, so let's just drop it
# Deleting the column by name (assigning NULL) is safer than dropping it
# by position (chicago[,-3]), which silently breaks if the column order
# ever changes; see also Recipe 5.28 in the R Cookbook
chicago$pm25median <- NULL
# Confirm that we've dropped the right column
colnames(chicago)
# Fit a smoothing spline for log(death) vs. time
smooth.death <- smooth.spline(x=chicago$time,y=log(chicago$death))
# How many degrees of freedom?
smooth.death$df  # fixed typo: was "smooth.deaf$df", an undefined object
# 149.5 and change effective degrees of freedom
# Plot the data
day.zero <- as.Date("1993-12-31")
# Creates a Date object with this as the starting date; addition or
# subtraction of days now moves us through the calendar
plot(chicago$time + day.zero,log(chicago$death),pch=16,cex=0.6)
lines(smooth.death,col="red",lwd=4)
# Does nothing but produces no error --- mis-match for x coordinates!
lines(x=smooth.death$x+day.zero, y=smooth.death$y, col="red",lwd=4)
# Change the number of degrees of freedom:
smooth.df.10 <- smooth.spline(x=chicago$time,y=log(chicago$death),df=10)
# check that this took:
smooth.df.10$df  # fixed typo: was "smmoth.df.10$df", an undefined object
# 10.00131 --- it tries the best it can!
# Add to the plot: could repeat the manipulations, but bundle it up into
# a function:
# Add a fitted smooth.spline object to the current plot, shifting its x
# coordinates (days since day.zero) onto the calendar-date axis.
#   a.spline: object returned by smooth.spline() (has $x and $y components)
#   col: line color
#   ...: further graphical parameters passed on to lines() (lwd, lty, etc.)
# Relies on the global day.zero defined above.
add.spline.to.plot <- function(a.spline,col,...) {
  lines(x=a.spline$x+day.zero, y=a.spline$y, col=col, ...)
}
add.spline.to.plot(smooth.df.10,col="blue")
# Very flat
# Now go to the other extreme: force far more degrees of freedom
smooth.df.500 <- smooth.spline(x=chicago$time,y=log(chicago$death),df=500)
add.spline.to.plot(smooth.df.500,col="purple")
# Very wiggly, but hardly more wiggly than the ~150 d.f. picked by
# cross-validation
# Fit the additive model:
chicago.gam.1 <- gam(log(death)~s(pm10median)+s(o3median)+s(so2median)+
s(time)+s(tmpd),data=chicago,na.action=na.exclude)
# na.exclude means to drop rows of the data frame with incomplete values
# when _estimating_ the model, but include NAs for them in the fitted or
# predicted values. The default na.action is na.omit, which drops
# those rows from estimation, but does not pad out the fitted values
chicago.gam.1 # look at the output
summary(chicago.gam.1) # some more summary numbers
plot(chicago.gam.1,pages=1) # all partial response functions on one plot
# Confirm that we have a fitted value for every day:
dim(chicago) # 5114 rows
length(fitted(chicago.gam.1)) # 5114
# Re-draw our basic plot; reuse day.zero rather than re-creating the same
# Date object inline (was as.Date("1993-12-31") again)
plot(x=day.zero+chicago$time,y=log(chicago$death),pch=16,cex=0.6)
add.spline.to.plot(smooth.death,col="red")
lines(chicago$time+day.zero,fitted(chicago.gam.1),col="purple")
# A close examination will show some gaps, where the GAM can't make
# predictions because of missing data
# Alternative approach: what if we don't know about the na.exclude?
chicago.gam.1b <- gam(log(death)~s(pm10median)+s(o3median)+s(so2median)+
s(time)+s(tmpd),data=chicago)
# No fitted values for some days at all:
length(fitted(chicago.gam.1b))
# [1] 4841
# How to figure out which days those are?
# First option: look!
head(fitted(chicago.gam.1b))
# 1 3 4 6 7 8
# 4.819072 4.814140 4.843185 4.801855 4.798527 4.811445
# Row numbers are retained as names here (note the gaps: rows 2 and 5
# had missing values and were dropped)
# Names are characters, convert to numbers:
good.days <- as.numeric(names(fitted(chicago.gam.1b)))
# Second option: filter
# Does the vector x contain at least one missing value?
any.na <- function(x) { sum(is.na(x)) > 0 }
# is.na(x) returns TRUE or FALSE depending on whether the elements of x are
# or are not NA
# any(b) returns TRUE if any of the elements of b are TRUE, else FALSE
# so any.na(x) will return TRUE if any element of x is NA
bad.days <- which(apply(chicago,1,any.na))
# For each row of chicago, check whether the row has any NAs; return the
# row numbers with missing data
# Cross-check: good days + bad days = total days
length(bad.days)+length(good.days) == nrow(chicago) # TRUE
# Cross-check: no day should be both good and bad
intersect(bad.days,good.days) # numeric(0)
# intersect(x,y) treats the vectors as sets, and returns their common values
# (intersection), or an empty value if there is no intersection
# similarly for union(x,y)
all(sort(union(bad.days,good.days)) == 1:nrow(chicago)) # TRUE
# Now:
# (fixed typo: was chicago$times, which is NULL --- no such column, so
# the lines() call silently plotted nothing)
lines(chicago$time[good.days]+day.zero,fitted(chicago.gam.1b),col="blue")
# Should cover over previous line!
# Notice that we've had to plot the fitted values twice, and it was a very
# similar operation both times. And we'll have to do it some more in
# the future. Write a function to handle the repetitive parts:
# Plot a model's fitted values against calendar dates on the current plot.
#   model: a fitted model with a fitted() method (here, a gam)
#   data: data frame holding the "time" column (defaults to chicago)
#   subset: row numbers whose dates line up with the model's fitted
#     values; by default, all rows
#   ...: additional, optional graphics arguments passed to lines()
#     (say col, lty, lwd, etc.)
# Relies on the global day.zero defined above.
lines.chicago.fits <- function(model,data=chicago,subset=seq_len(nrow(data)),...) {
  # Extract the time component of the data frame on the specified rows
  # (seq_len, not 1:nrow, so an empty data frame gives an empty subset)
  dates <- data$time[subset] + day.zero
  # Guard against a silent length mismatch: lines() would otherwise
  # recycle the shorter vector and quietly draw garbage
  stopifnot(length(dates) == length(fitted(model)))
  # Draw lines, plotting dates vs. fitted values
  lines(x=dates,y=fitted(model),...)
}
lines.chicago.fits(model=chicago.gam.1,col="purple",lwd=3)
lines.chicago.fits(model=chicago.gam.1b,subset=good.days,col="blue")
# Should have the thinner blue line overlaid on the purple
##############################
# Looking at averaged values #
##############################
# Assignment-provided function to do an average of the data over a trailing
# (lagging) window
# Trailing (lagging) moving average of a vector.
# Output element t is the mean of x[t+window], x[t+window-1], ...,
# x[t], i.e., each day averaged with the `window` previous days.
#   x: numeric vector
#   window: number of previous values to average in (non-negative
#     integer, must be strictly less than length(x))
# Returns a numeric vector of length length(x) - window: the first
# `window` entries of x have no complete trailing window and are dropped.
lag.mean <- function(x, window) {
  n <- length(x)
  # Validate up front: the original code failed obscurely inside rep()
  # (or produced garbage indexing) when window >= n or window < 0
  stopifnot(window >= 0, window < n)
  y <- rep(0,n-window)
  # Accumulate the window+1 shifted copies of the series, then divide
  for (t in 0:window) {
    y <- y + x[(t+1):(n-window+t)]
  }
  return(y/(window+1))
}
# Confirm that this works: look at the beginning of one series
chicago$death[1:6]
lag.mean(chicago$death[1:6],1) # Average each day with yesterday
# Check by hand that this gives the right 6-1=5 numbers
lag.mean(chicago$death[1:6],3) # Average each day with 3 previous days
# Again, check by hand that this gives the right 6-3=3 numbers
# We want to make a data frame which averages temperature and pollution over
# lagging 3-day windows, but not deaths or the date.
#### Option A: the hard way
# copy the data frame
chicago.avg <- chicago
is.data.frame(chicago.avg) # TRUE
colnames(chicago.avg) # matches colnames(chicago) by inspection
all(colnames(chicago.avg) == colnames(chicago)) # TRUE
# A 3-day window means we don't get our first averaged value until day 4
chicago.avg <- chicago.avg[-(1:3),]
# Apply averaging to each pollution or temperature column of the original
chicago.avg$pm10median <- lag.mean(chicago$pm10median,3)
chicago.avg$o3median <- lag.mean(chicago$o3median,3)
chicago.avg$so2median <- lag.mean(chicago$so2median,3)
chicago.avg$tmpd <- lag.mean(chicago$tmpd,3)
# Check our work:
# (fixed: was dim(chicago), which checks the wrong, unmodified object)
dim(chicago.avg) # Lost 3 days, right number of columns
all(colnames(chicago.avg)==colnames(chicago)) # Make sure names haven't changed
# Manual check the averaging for the first few rows:
head(chicago.avg)
head(chicago)
# If paranoid/application is important, pick some rows at random and check
# there too
rm(chicago.avg)
# Make our previous averages go away (not strictly necessary, but confirms
# that there is no trickery here)
##### Option B:
# Rather than write out the averaging multiple times, tell R to do it to
# selected columns of chicago
# See Recipe 6.4 in the R Cookbook
# Select the pollution and temperature columns by name rather than by
# position (was c(2,3,4,6)), so this keeps working if columns move around
chicago.avg <- apply(chicago[,c("pm10median","o3median","so2median","tmpd")],
  2,lag.mean,window=3)
is.data.frame(chicago.avg) # FALSE: apply busts it down to an array
# Add in the appropriate parts of the non-averaged variables, convert to
# data frame
chicago.avg <- data.frame(chicago.avg,time=chicago$time[-(1:3)],
death=chicago$death[-(1:3)])
colnames(chicago.avg) # Same set of names but a different order
# Check values as before
##### Option B-prime:
rm(chicago.avg)
# Average ALL the columns, and convert the matrix that apply() returns
# back to a data frame FIRST: assigning with $ to a matrix is an error
# in modern R ("$ operator is invalid for atomic vectors"; it only
# "worked" in very old versions by coercing to a list, with a warning)
chicago.avg <- as.data.frame(apply(chicago,2,lag.mean,window=3))
# replace the averaged deaths and times with appropriate values
chicago.avg$death <- chicago$death[-(1:3)]
chicago.avg$time <- chicago$time[-(1:3)]
# Fit the gam to the time-averaged values
chicago.gam.2 <- gam(log(death)~s(pm10median)+s(o3median)+s(so2median)+s(time)+
s(tmpd),data=chicago.avg,na.action=na.exclude)
# A trick to avoid typing the same formula over and over:
#### gam(formula(chicago.gam.1),data=chicago.avg,na.action=na.exclude)
# would take the formula out of chicago.gam.1, and use it as the formula
# here
length(fitted(chicago.gam.2)) # 5111, one per day in reduced data set
summary(fitted(chicago.gam.2)) # Lots of NAs
# Add yet another line, but for the fitted values of the gam
lines.chicago.fits(model=chicago.gam.2,data=chicago.avg,col="green")
# Note use of data argument in our function
####### Examination around the outliers
# Find the row number of the highest-death day
worst.day <- which.max(chicago.avg$death)
# look 3 days before and after
chicago.avg[(worst.day-3):(worst.day+3),]
# look at the same window one year (365 days) earlier, for comparison
chicago.avg[(worst.day-3-365):(worst.day+3-365),]
# Notice: during the bad patch, temperature maybe 10+ degrees higher
# particulate levels around 20--30, not around 0
# ozone around 20--30, not around 2--5
# difference in so2 not so striking
# Suggests trying interactions of ozone with temperature or particulates with
# temperature, or indeed of particulates with ozone
# Let's say ozone and temperature
# s(tmpd,o3median) is a joint two-argument smooth of temperature and
# ozone, which allows for an interaction between them
chicago.gam.3 <- gam(log(death) ~ s(tmpd,o3median) + s(so2median)
+ s(time) + s(pm10median), data=chicago.avg,na.action=na.exclude)