# Examples from class, 14 February 2012 # Load required packages and the data set library(mgcv) library(gamair) data(chicago) # Examine the data set: colnames(chicago) summary(chicago) # "pm25median" is full of NAs, not collected until the latter part of the # data, so let's just drop it chicago <- chicago[,-3] # Annoyingly, while we can select columns by name, we cannot _deselect_ # them by name. However, see Recipe 5.28 in the R Cookbook # Confirm that we've dropped the right column colnames(chicago) # Fit a smoothing spline for log(death) vs. time smooth.death <- smooth.spline(x=chicago$time,y=log(chicago$death)) # How many degrees of freedom? smooth.deaf$df # 149.5 and change effective degrees of freedom # Plot the data day.zero <- as.Date("1993-12-31") # Creates a Date object with this as the starting date; addition or # subtract of days now moves us through the calendar plot(chicago$time + day.zero,log(chicago$death),pch=16,cex=0.6) lines(smooth.death,col="red",lwd=4) # Does nothing but produces no error --- mis-match for x coordinates! lines(x=smooth.death$x+day.zero, y=smooth.death$y, col="red",lwd=4) # Change the number of degrees of freedom: smooth.df.10 <- smooth.spline(x=chicago$time,y=log(chicago$death),df=10) # check that this took: smmoth.df.10$df # 10.00131 --- it tries the best it can! # Add to the plot: could repeat the manipulations, but bundle it up into # a function: add.spline.to.plot <- function(a.spline,col) { lines(a.spline$x+day.zero, y=a.spline$y,col=col) } add.spline.to.plot(smooth.df.10,col="blue") # Very flat smooth.df.500 <- smooth.spline(x=chicago$time,y=log(chicago$death),df=500) add.spline.to.plot(smooth.df.500,col="purple") # Very wiggly, but hardly more wiggly than the ~150 d.f. picked by # cross-validation # Fit the additive model: chicago.gam.1 <- gam(log(death)~s(pm10median)+s(o3median)+s(so2median)+ s(time)+s(tmpd),data=chicago,na.action=na.exclude) # na.exclude means to drop rows of the data frame with incomplete values # when _estimating_ the model, but include NAs for them in the fitted or # predicted values. The default na.action is na.omit, which drops # those rows from estimation, but does not pad out the fitted values chicago.gam.1 # look at the output summary(chicago.gam.1) # some more summary numbers plot(chicago.gam.1,pages=1) # all partial response functions on one plot # Confirm that we have a fitted value for every day: dim(chicago) # 5114 rows length(fitted(chicago.gam.1)) # 5114 # Re-draw our basic plot plot(x=as.Date("1993-12-31")+chicago$time,y=log(chicago$death),pch=16,cex=0.6) add.spline.to.plot(smooth.death,col="red") lines(chicago$time+day.zero,fitted(chicago.gam.1),col="purple") # A close examination will show some gaps, where the GAM can't make # predictions because of missing data # Alternative approach: what if we don't know about the na.exclude? chicago.gam.1b <- gam(log(death)~s(pm10median)+s(o3median)+s(so2median)+ s(time)+s(tmpd),data=chicago) # No fitted values for some days at all: length(fitted(chicago.gam.1b)) # [1] 4841 # How to figure out which days those are? # First option: look! head(fitted(chicago.gam.1b)) # 1 3 4 6 7 8 # 4.819072 4.814140 4.843185 4.801855 4.798527 4.811445 # Row numbers are retained as names here # Names are characters, convert to numbers: good.days <- as.numeric(names(fitted(chicago.gam.1b))) # Second option: filter any.na <- function(x) { any(is.na(x)) } # is.na(x) returns TRUE or FALSE depending on whether the elements of x are # or are not NA # any(b) returns TRUE if any of the elements of b are TRUE, else FALSE # so any.na(x) will return TRUE if any element of x is NA bad.days <- which(apply(chicago,1,any.na)) # For each row of chicago, check whether the row has any NAs; return the # row numbers with missing data # Cross-check: good days + bad days = total days length(bad.days)+length(good.days) == nrow(chicago) # TRUE # Cross-check: no day should be both good and bad intersect(bad.days,good.days) # numeric(0) # intersect(x,y) treats the vectors as sets, and returns their common values # (intersection), or an empty value if there is no intersection # similarly for union(x,y) all(sort(union(bad.days,good.days)) == 1:nrow(chicago)) # TRUE # Now: lines(chicago$times[good.days]+day.zero,fitted(chicago.gam.1b),col="blue") # Should cover over previous line! # Notice that we've had to plot the fitted values twice, and it was a very # similar operation both times. And we'll have to do it some more in # the future. Write a function to handle the repetive parts: lines.chicago.fits <- function(model,data=chicago,subset=1:nrow(data),...) { # Extract the time component of the data frame on the specified rows # by default, all rows dates <- data$time[subset] + day.zero # Draws lines, plotting dates vs. fitted values lines(x=dates,y=fitted(model),...) # The ... passes along any additional, but optional, arguments we may want # to give, here graphics options (say col, lty, lwd, etc.) } lines.chicago.fits(model=chicago.gam.1,col="purple",lwd=3) lines.chicago.fits(model=chicago.gam.1b,subset=good.days,col="blue") # Should have the thinner blue line overlaid on the purple ############################## # Looking at averaged values # ############################## # Assignment-provided function to do an average of the data over a trailing # (lagging) window lag.mean <- function(x, window) { n <- length(x) y <- rep(0,n-window) for (t in 0:window) { y <- y + x[(t+1):(n-window+t)] } return(y/(window+1)) } # Confirm that this works: look at the beginning of one series chicago$death[1:6] lag.mean(chicago$death[1:6],1) # Average each day with yesterday # Check by hand that this gives the right 6-1=5 numbers lag.mean(chicago$death[1:6],3) # Average each day with 3 previous days # Again, check by hand that this gives the right 6-3=3 numbers # We want to make a data frame which averages temperature and pollution over # lagging 3-day windows, but not deaths or the date. #### Option A: the hard way # copy the data frame chicago.avg <- chicago is.data.frame(chicago.avg) # TRUE colnames(chicago.avg) # matches colnames(chicago) by inspection all(colnames(chicago.avg) == colnames(chicago)) # TRUE # A 3-day window means we don't get our first averaged value until day 4 chicago.avg <- chicago.avg[-(1:3),] # Apply averaging to each pollution or temperature column of the original chicago.avg$pm10median <- lag.mean(chicago$pm10median,3) chicago.avg$o3median <- lag.mean(chicago$o3median,3) chicago.avg$so2median <- lag.mean(chicago$so2median,3) chicago.avg$tmpd <- lag.mean(chicago$tmpd,3) # Check our work: dim(chicago) # Lost 3 days, right number of columns all(colnames(chicago.avg)==colnames(chicago)) # Make sure names haven't changed # Manual check the averaging for the first few rows: head(chicago.avg) head(chicago) # If paranoid/application is important, pick some rows at random and check # there too rm(chicago.avg) # Make our previous averages go away (not strictly necessary, but confirms # that there is no trickery here) ##### Option B: # Rather than write out the averaging multiple times, tell R to do it to # selected columns of chicago # See Recipe 6.4 in the R Cookbook # Looking at the column names, pollution and temperature occupies columns # 2, 3, 4 and 6 chicago.avg <- apply(chicago[,c(2,3,4,6)],2,lag.mean,window=3) is.data.frame(chicago.avg) # FALSE: apply busts it down to an array # Add in the appropriate parts of the non-averaged variables, convert to # data frame chicago.avg <- data.frame(chicago.avg,time=chicago$time[-(1:3)], death=chicago$death[-(1:3)]) colnames(chicago.avg) # Same set of names but a different order # Check values as before ##### Option B-prime: rm(chicago.avg) # Average ALL the columns chicago.avg <- apply(chicago,2,lag.mean,window=3) # replace the averaged deaths and times with appropriate values chicago.avg$death <- chicago$death[-(1:3)] chicago.avg$time <- chicago$time[-(1:3)] # Convert back to data frame chicago.avg <- as.data.frame(chicago.avg) # Fit the gam to the time-averaged values chicago.gam.2 <- gam(log(death)~s(pm10median)+s(o3median)+s(so2median)+s(time)+ s(tmpd),data=chicago.avg,na.action=na.exclude) # A trick to avoid typing the same formula over and over: #### gam(formula(chicago.gam.1),data=chicago.avg,na.action=na.exclude) # would take the formula out of chicago.gam.1, and use it as the formula # here length(fitted(chicago.gam.2)) # 5111, one per day in reduced data set summary(fitted(chicago.gam.2)) # Lots of NAs # Add yet another line, but for the fitted values of the gam lines.chicago.fits(model=chicago.gam.2,data=chicago.avg,col="green") # Note use of data argument in our function ####### Examination around the outliers # Find the row number of the highest-death day worst.day <- which.max(chicago.avg$death) # look 3 days before and after chicago.avg[(worst.day-3):(worst.day+3),] # look a year before that chicago.avg[(worst.day-3-365):(worst.day+3-365),] # Notice: during the bad patch, temperature maybe 10+ degrees higher # particular levels round 20--30, not around 0 # ozone around 20--30, not around 2--5 # difference in so2 not so striking # Suggests trying interactions of ozone with temperature or particulates with # temperature, or indeed of particulates with ozone # Let's say ozone and temperature chicago.gam.3 <- gam(log(death) ~ s(tmpd,o3median) + s(so2median) + s(time) + s(pm10median), data=chicago.avg,na.action=na.exclude)