library(faraway) # Snoqualmie Falls precipitation example # Data consists of precipitation, in 1/100 of an inch, for each day for # each year from 1948 through 1983, at Snoqualmie Falls, Wash. # load: snoqualmie <- read.csv("snoqualmie.csv",header=FALSE) # Let's get rid of the year breaks snoqualmie <- unlist(snoqualmie) # Most years have NAs for the 366th day, let's remove that snoqualmie <- na.omit(snoqualmie) # Look at the histogram, plot(hist(snoqualmie,n=50,probability=TRUE),xlab="Precipitation (1/100 inch)") rug(snoqualmie,col="grey") # and the scatter-plot of successive values plot(snoqualmie[-length(snoqualmie)],snoqualmie[-1], xlab="Precipitation today (1/100 inch)", ylab="Precipitation tomorrow (1/100 inch)",cex=0.1) rug(snoqualmie[-length(snoqualmie)],side=1,col="grey") rug(snoqualmie[-1],side=2,col="grey") # both show big spikes at zero precipitation. We'll try predicting whether # there will be any precipitation tomorrow, from how much there was today. # Re-shape so we have pairs of sequential days vector.to.pairs <- function(v) { v <- as.numeric(v) n <- length(v) return(cbind(v[-1],v[-n])) } snoq.pairs <- vector.to.pairs(snoqualmie) colnames(snoq.pairs) <- c("tomorrow","today") snoq <- as.data.frame(snoq.pairs) # Fit the logistic regression model snoq.logistic <- glm((tomorrow > 0) ~ today, data=snoq, family=binomial) # Look at coeffcients etc. --- positive coefficient on today's precipitation, # as we'd expected from the scatterplot and background knowledge summary(snoq.logistic) # Plot data, plus predicted probabilities with confidence interval on predicted # probability plot((tomorrow>0)~today,data=snoq,xlab="Precipitation today (1/100 inch)", ylab="Positive precipitation tomorrow?") rug(snoq$today,col="grey") # Fake data to give visually-nice, evenly-spaced points for plotting data.plot <- data.frame(today=(0:500)) # Work out confidence interval on the logit scale (where things are roughly # Gaussian), then transform to probability scale logistic.predictions <- predict(snoq.logistic,newdata=data.plot,se.fit=TRUE) lines(0:500,ilogit(logistic.predictions$fit)) lines(0:500,ilogit(logistic.predictions$fit+1.96*logistic.predictions$se.fit), lty=2) lines(0:500,ilogit(logistic.predictions$fit-1.96*logistic.predictions$se.fit), lty=2) # Simple spline smoothing for comparison snoq.spline <- smooth.spline(x=snoq$today,y=(snoq$tomorrow>0)) # Add the spline to the plot lines(snoq.spline,col="red") # Pretty good match to the logistic regression up to about 1.3 inches # Try a GAM, which moves the smoother _inside_ the likelihood model library(mgcv) # Note: you may need to un-load the gam library with detach() snoq.gam <- gam((tomorrow>0)~s(today),data=snoq,family=binomial) # Add the GAM to the plot (again using the logit-scale trick) gam.predictions <- predict.gam(snoq.gam,newdata=data.plot,se.fit=TRUE) lines(0:500,ilogit(gam.predictions$fit),col="blue") lines(0:500,ilogit(gam.predictions$fit+1.96*gam.predictions$se.fit),col="blue", lty=2) lines(0:500,ilogit(gam.predictions$fit-1.96*gam.predictions$se.fit),col="blue", lty=2) # Note: similar to the simple spline smoothing, but not quite identical! # Also note: huge error bands at the right, where there's very little data # Width of GLM confidence bands: plot(0:500,ilogit(logistic.predictions$fit) -ilogit(logistic.predictions$fit-1.96*logistic.predictions$se.fit), type="l",col="blue",xlab="Precipitation today (1/100 inch)", main="Difference in probability between prediction\n and confidence limit for prediction", ylab = expression(paste(Delta,"probability"))) lines(0:500,ilogit(logistic.predictions$fit+1.96*logistic.predictions$se.fit) -ilogit(logistic.predictions$fit)) # Notice that these are asymmetric, and actually narrower at really large # x than at moderate x; why? # Significant mis-specification for the GLM? # Observed difference in deviance diff.dev.obs <- snoq.logistic$deviance - snoq.gam$deviance # Simulate from the fitted logistic regression model for Snoqualmie # Presumes: fitted values of the model are probabilities. snoq.sim <- function(model=snoq.logistic) { fitted.probs <- fitted(model) n <- length(fitted.probs) new.binary <- rbinom(n,size=1,prob=fitted.probs) return(new.binary) } # Simulate from fitted logistic regression, re-fit logistic regression and # GAM, calculate difference in deviances diff.dev <- function(model=snoq.logistic,x=snoq[,2]) { y.new <- snoq.sim(model) GLM.dev <- glm(y.new ~ x,family=binomial)$deviance GAM.dev <- gam(y.new ~ s(x),family=binomial)$deviance return(GLM.dev-GAM.dev) } # The following takes ~1500 seconds on my computer; reduce number of replicates # to taste null.dist.of.diff.dev <- replicate(1000,diff.dev()) p.value <- (1+sum(null.dist.of.diff.dev > diff.dev.obs))/(1+length(null.dist.of.diff.dev)) # p-value is < 10^-3; the observed difference is much bigger than what's # generated by simulating from the logistic model # What went wrong with the logistic model? Inspecting the plots, the spline # and the GAM both predict lower probabilities at X=0 than the GLM does, and # their predicted probabilities increase very steeply there. # GLM's predicted probability of precipitation following a dry day: ilogit(snoq.logistic$coefficients[1]) # which is 0.502 # Actual frequency mean(snoq$tomorrow[snoq$today==0]>0) # which is 0.470 # Since there are a _lot_ of dry days (also about 47%), a couple of percentage # points difference in probability here has a big contribution in the # likelihood. # Therefore: Try telling the GLM that no precipitation today is a special value # Expand the data with an extra column, indicating whether today was dry or # not snoq2 <- data.frame(snoq,dry=ifelse(snoq$today==0,1,0)) # Fit a new logistic model with that dummy variable snoq2.logistic <- glm((tomorrow > 0) ~ today + dry,data=snoq2,family=binomial) # GAM ditto snoq2.gam <- gam((tomorrow > 0) ~ s(today) + dry,data=snoq2,family=binomial) # Plot the new models plot((tomorrow>0)~today,data=snoq,xlab="Precipitation today (1/100 inch)", ylab="Positive precipitation tomorrow?") rug(snoq$today,side=1,col="grey") # Need to add a "dry" column to the fake data for plotting data.plot=data.frame(data.plot,dry=ifelse(data.plot$today==0,1,0)) logistic.predictions2 <- predict(snoq2.logistic,newdata=data.plot,se.fit=TRUE) lines(0:500,ilogit(logistic.predictions2$fit)) lines(0:500,ilogit(logistic.predictions2$fit+1.96*logistic.predictions2$se.fit), lty=2) lines(0:500,ilogit(logistic.predictions2$fit-1.96*logistic.predictions2$se.fit), lty=2) gam.predictions2 <- predict.gam(snoq2.gam,newdata=data.plot,se.fit=TRUE) lines(0:500,ilogit(gam.predictions2$fit),col="blue") lines(0:500,ilogit(gam.predictions2$fit+1.96*gam.predictions2$se.fit), col="blue",lty=2) lines(0:500,ilogit(gam.predictions2$fit-1.96*gam.predictions2$se.fit), col="blue",lty=2) # Notice that the GLM and the GAM now coincide almost exactly in their # predictions. # Look at snoq2.gam to make sure I didn't accidentally forget to turn on # smoothing! summary(snoq2.gam) plot(snoq2.gam) # The "today" term is in fact a non-parametric smooth with a little bit of # curvature # Difference still significant? diff.dev.obs2 <- snoq2.logistic$deviance - snoq2.gam$dev diff.dev.2 <- function(model=snoq2.logistic,x=snoq2[,-1]) { y.new <- snoq.sim(model) sim.data <- data.frame(x,tomorrow=y.new) GLM.dev <- glm(tomorrow ~ today + dry,data=sim.data,family=binomial)$deviance GAM.dev <- gam(tomorrow ~ s(today)+dry,data=sim.data,family=binomial)$deviance return(GLM.dev-GAM.dev) } null.dist.of.diff.dev2 <- replicate(100,diff.dev.2()) p.value <- (1+sum(null.dist.of.diff.dev2 > diff.dev.obs2))/(1+length(null.dist.of.diff.dev2)) # As close to 1 as computation time allows # So let's look at calibration # First, what's the predicted probability of precipitation after dry days? predict(snoq2.logistic,newdata=data.frame(today=0,dry=1),type="response") # Matches the actual frequency to seven decimal places --- it should, we've # introduced a coefficient to do just this! # Look at the histogram of predicted probabilities hist(fitted(snoq2.logistic)) # Big gap between 0.47 and 0.55 # Are we calibrated between say 0.55 and 0.56? mean(snoq$tomorrow[(fitted(snoq2.logistic) >= 0.55) & (fitted(snoq2.logistic) < 0.56)] > 0) # Gives 0.547, pretty reasonable # Write a function to do this less manually frequency.vs.probability <- function(p.lower,p.upper=p.lower+0.01, model=snoq2.logistic, data=(snoq$tomorrow>0)) { # Presume that fitted values are probabilities fitted.probs <- fitted(model) # Which rows of the data have fitted probabilities in our range of interest? indices <- (fitted.probs >= p.lower) & (fitted.probs < p.upper) # For plotting purposes, what's our average predicted probability over this # range? ave.prob <- mean(fitted.probs[indices]) # How big a standard error should we see for binomial data with this predicted # probability? # Rough calculation, could improve by taking account of variation in # predicted probabilities, not reliable if difference between p.lower # and p.upper was substantial se <- sqrt(ave.prob*(1-ave.prob)/sum(indices)) # How often does the event actually happen? # Presumes data is either 0/1 or TRUE/FALSE valued frequency <- mean(data[indices]) out <- list(frequency=frequency,ave.prob=ave.prob,se=se) return(out) } # Look at the whole range of probabilities between 0.55 and 0.74 (because # that's most of our predictions, and there's a gap about 0.75 which is # annoying to deal with) f.vs.p <- sapply((55:74)/100,frequency.vs.probability) # Re-shape the data to remove some R weirdness f.vs.p <- data.frame(frequency=unlist(f.vs.p["frequency",]), ave.prob=unlist(f.vs.p["ave.prob",]), se=unlist(f.vs.p["se",])) # Calibration plot plot(f.vs.p$ave.prob,f.vs.p$frequency,xlim=c(0,1),ylim=c(0,1), xlab="Predicted probabilities",ylab="Observed frequencies") rug(fitted(snoq2.logistic),col="grey") abline(0,1,col="grey") segments(x0=f.vs.p$ave.prob,y0=f.vs.p$ave.prob-1.96*f.vs.p$se, y1=f.vs.p$ave.prob+1.96*f.vs.p$se) # Vertical lines are (approximate) 95% sampling intervals for the frequency, # given the predicted probability and the number of observations