## ----echo=FALSE---------------------------------------------------------- library(knitr) opts_chunk$set(size="small",background="white", highlight=FALSE, cache=TRUE, autdep=TRUE, tidy=TRUE) ## ----initial-data-plot, echo=FALSE--------------------------------------- # Load the data set from the last homework library(gamair) data(chicago) # Plot deaths each day vs. temperature plot(death ~ tmpd, data=chicago, xlab="Temperature (Farenheit)", ylab="Mortality (deaths/day)") # Estimate and store a linear model death.temp.lm <- lm(death ~ tmpd, data=chicago) abline(death.temp.lm, col="blue") ## ----eval=FALSE---------------------------------------------------------- ## # Load the data set from the last homework ## library(gamair) ## data(chicago) ## ## # Plot deaths each day vs. temperature ## plot(death ~ tmpd, data=chicago, ## xlab="Temperature (Farenheit)", ## ylab="Mortality (deaths/day)") ## ## # Estimate and store a linear model ## death.temp.lm <- lm(death ~ tmpd, data=chicago) ## abline(death.temp.lm, col="blue") ## ----residuals-vs-predictor, echo=FALSE---------------------------------- # Always plot residuals vs. predictor variable plot(chicago$tmpd, residuals(death.temp.lm), xlab="Temperature (F)", ylab="Prediction error (deaths/day)") abline(h=0, col="grey") ## ----eval=FALSE---------------------------------------------------------- ## # Always plot residuals vs. predictor variable ## plot(chicago$tmpd, residuals(death.temp.lm), ## xlab="Temperature (F)", ## ylab="Prediction error (deaths/day)") ## abline(h=0, col="grey") ## ----residuals-sq-vs-temperature, echo=FALSE----------------------------- # Always plot residuals^2 vs. # predictor variable plot(chicago$tmpd, residuals(death.temp.lm)^2, xlab="Temperature (F)", ylab="Squared prediction error (deaths/day)^2") abline(h=mean(residuals(death.temp.lm)^2), lwd=2, col="grey") ## ----eval=FALSE---------------------------------------------------------- ## # Always plot residuals^2 vs. ## # predictor variable ## plot(chicago$tmpd, residuals(death.temp.lm)^2, ## xlab="Temperature (F)", ## ylab="Squared prediction error (deaths/day)^2") ## abline(h=mean(residuals(death.temp.lm)^2), ## lwd=2, col="grey") ## ----abs-residuals-vs-temperature, echo=FALSE---------------------------- plot(chicago$tmpd, abs(residuals(death.temp.lm)), xlab="Temperature (F)", ylab="Absolute prediction error (deaths/day)") abline(h=sqrt(mean(residuals(death.temp.lm)^2)), lwd=2, col="grey") ## ----eval=FALSE---------------------------------------------------------- ## plot(chicago$tmpd, abs(residuals(death.temp.lm)), ## xlab="Temperature (F)", ## ylab="Absolute prediction error (deaths/day)") ## abline(h=sqrt(mean(residuals(death.temp.lm)^2)), ## lwd=2, col="grey") ## ----residuals-vs-date, echo=FALSE--------------------------------------- # Always plot residuals vs. coordinate plot(as.Date(chicago$time,origin="1993-12-31"), residuals(death.temp.lm), xlab="Date", ylab="Prediction error (deaths/day)") ## ----eval=FALSE---------------------------------------------------------- ## # Always plot residuals vs. coordinate ## plot(as.Date(chicago$time,origin="1993-12-31"), ## residuals(death.temp.lm), xlab="Date", ## ylab="Prediction error (deaths/day)") ## ----eval=FALSE---------------------------------------------------------- ## sample(residuals(my.model)) ## ----residual-vs-residual, echo=FALSE------------------------------------ # Always plot successive residuals against each other # head() and tail() here are used to get "every day except the last" # and "every day except the first", respectively # see help(head) plot(head(residuals(death.temp.lm),-1), tail(residuals(death.temp.lm),-1), xlab="Residual today", ylab="Residual tomorrow") abline(lm(tail(residuals(death.temp.lm),-1) ~ head(residuals(death.temp.lm),-1))) ## ----eval=FALSE---------------------------------------------------------- ## # Always plot successive residuals against each other ## # head() and tail() here are used to get "every day except the last" ## # and "every day except the first", respectively ## # see help(head) ## plot(head(residuals(death.temp.lm),-1), ## tail(residuals(death.temp.lm),-1), ## xlab="Residual today", ## ylab="Residual tomorrow") ## abline(lm(tail(residuals(death.temp.lm),-1) ~ head(residuals(death.temp.lm),-1))) ## ----histogram-of-residuals, echo=FALSE---------------------------------- # Always plot distribution of residuals hist(residuals(death.temp.lm),breaks=40, freq=FALSE, xlab="Residual (deaths/day)", main="") curve(dnorm(x,mean=0,sd=sd(residuals(death.temp.lm))), add=TRUE,col="blue") ## ----eval=FALSE---------------------------------------------------------- ## # Always plot distribution of residuals ## hist(residuals(death.temp.lm),breaks=40, ## freq=FALSE, ## xlab="Residual (deaths/day)", main="") ## curve(dnorm(x,mean=0,sd=sd(residuals(death.temp.lm))), ## add=TRUE,col="blue") ## ----qqplot-of-residuals, echo=FALSE------------------------------------- # An alternative: plot vs. theoretical Gaussian distribution qqnorm(residuals(death.temp.lm)) qqline(residuals(death.temp.lm)) ## ----eval=FALSE---------------------------------------------------------- ## # An alternative: plot vs. theoretical Gaussian distribution ## qqnorm(residuals(death.temp.lm)) ## qqline(residuals(death.temp.lm)) ## ----set-up-cv----------------------------------------------------------- # Always look at whether the model can extrapolate to # new data # Basic check: randomly divide into two parts, here say 90% of the data # vs. 10% # Use the "training set" to estimate the model training.rows <- sample(1:nrow(chicago), size=round(nrow(chicago)*0.9), replace=FALSE) training.set <- chicago[training.rows,] # We'll use the "testing set" to see how well it does testing.set <- chicago[-training.rows,] # Estimate the model on the training set only training.lm <- lm(death ~ tmpd, data=training.set) # Make predictions on the testing set # The model didn't get to see these points while it was being # estimated, so this really checks (or tests) whether it can # predict testing.preds <- predict(training.lm, newdata=testing.set) # Unfortunately residuals() doesn't know about the new data set # so calculate the residuals by hand testing.residuals <- testing.set$death-testing.preds ## ----cv-residuals-plot,echo=FALSE---------------------------------------- # Plot our residuals against the predictor variable plot(testing.set$tmpd, testing.residuals, xlab="Daily mean temperature (F)", ylab="Prediction error (deaths/day)", main="Out-of-sample residuals") abline(h=0,col="grey") abline(h=mean(testing.residuals),col="red") ## ----eval=FALSE---------------------------------------------------------- ## # Plot our residuals against the predictor variable ## plot(testing.set$tmpd, testing.residuals, ## xlab="Daily mean temperature (F)", ## ylab="Prediction error (deaths/day)", ## main="Out-of-sample residuals") ## abline(h=0,col="grey") ## abline(h=mean(testing.residuals),col="red") ## ----cv-residuals-sq-plot, echo=FALSE------------------------------------ # Plot absolute residuals vs. predictor variable plot(testing.set$tmpd, abs(testing.residuals), xlab="Daily mean temperature (F)", ylab="Absolute prediction error (deaths/day)", main="Out-of-sample absolute residuals") abline(h=sqrt(mean(residuals(training.lm)^2)),col="grey") abline(h=sqrt(mean(testing.residuals^2)),col="red") ## ----eval=FALSE---------------------------------------------------------- ## # Plot absolute residuals vs. predictor variable ## plot(testing.set$tmpd, abs(testing.residuals), ## xlab="Daily mean temperature (F)", ## ylab="Absolute prediction error (deaths/day)", ## main="Out-of-sample absolute residuals") ## abline(h=sqrt(mean(residuals(training.lm)^2)),col="grey") ## abline(h=sqrt(mean(testing.residuals^2)),col="red") ## ----cv-by-temp---------------------------------------------------------- # Find the low-temperature days lowtemp.rows <- which(chicago$tmpd < 75) # About 90% of the data # Divide into low- and high- temperature data sets lowtemp.set <- chicago[lowtemp.rows,] hightemp.set <- chicago[-lowtemp.rows,] # Estimate the model on the colder days only lowtemp.lm <- lm(death ~ tmpd, data=lowtemp.set) # For you: how much do the parameters change, as compared to # using all the data? # Now predict on the high-temperature days # Again, these are new data points, but now systematically # different (because of their temperature) from the # data used to estimate hightemp.preds <- predict(lowtemp.lm, newdata=hightemp.set) # Calculate our own residuals hightemp.residuals <- hightemp.set$death-hightemp.preds ## ----cv-by-temp-plot, echo=FALSE----------------------------------------- # Plot residuals vs. temperature plot(hightemp.set$tmpd, hightemp.residuals, xlab="Daily mean temperature (F)", ylab="Prediction error (deaths/day)", main="Out-of-sample residuals") # Flat line at 0 (ideal, if the model is right) abline(h=0,col="grey") # Flat line at the mean of the new residuals abline(h=mean(hightemp.residuals),col="red") # Regressing the new residuals on temperature does not look good... abline(lm(hightemp.residuals ~ hightemp.set$tmpd),col="purple") ## ----eval=FALSE---------------------------------------------------------- ## # Plot residuals vs. temperature ## plot(hightemp.set$tmpd, hightemp.residuals, ## xlab="Daily mean temperature (F)", ## ylab="Prediction error (deaths/day)", ## main="Out-of-sample residuals") ## # Flat line at 0 (ideal, if the model is right) ## abline(h=0,col="grey") ## # Flat line at the mean of the new residuals ## abline(h=mean(hightemp.residuals),col="red") ## # Regressing the new residuals on temperature does not look good... ## abline(lm(hightemp.residuals ~ hightemp.set$tmpd),col="purple") ## ----abs-cv-by-temp-plot, echo=FALSE------------------------------------- # Similar plots for the absolute residuals plot(hightemp.set$tmpd, abs(hightemp.residuals), xlab="Daily mean temperature (F)", ylab="Absolute prediction error (deaths/day)", main="Out-of-sample absolute residuals") abline(h=sqrt(mean(residuals(lowtemp.lm)^2)),col="grey") abline(h=sqrt(mean(hightemp.residuals^2)),col="red") abline(lm(abs(hightemp.residuals) ~ hightemp.set$tmpd),col="purple") ## ----eval=FALSE---------------------------------------------------------- ## # Similar plots for the absolute residuals ## plot(hightemp.set$tmpd, abs(hightemp.residuals), ## xlab="Daily mean temperature (F)", ## ylab="Absolute prediction error (deaths/day)", ## main="Out-of-sample absolute residuals") ## abline(h=sqrt(mean(residuals(lowtemp.lm)^2)),col="grey") ## abline(h=sqrt(mean(hightemp.residuals^2)),col="red") ## abline(lm(abs(hightemp.residuals) ~ hightemp.set$tmpd),col="purple") ## ----eval=FALSE---------------------------------------------------------- ## smooth.spline(x,y,cv=TRUE) ## ----smooth.spline-demo, echo=FALSE, warning=FALSE----------------------- plot(death ~ tmpd, data=chicago, xlab="Temperature (Farenheit)", ylab="Mortality (deaths/day)") death.temp.ss <- smooth.spline(x=chicago$tmpd, y=chicago$death, cv=TRUE) abline(death.temp.lm, col="blue") lines(death.temp.ss, col="red") ## ----eval=FALSE---------------------------------------------------------- ## plot(death ~ tmpd, data=chicago, ## xlab="Temperature (Farenheit)", ## ylab="Mortality (deaths/day)") ## death.temp.ss <- smooth.spline(x=chicago$tmpd, y=chicago$death, cv=TRUE) ## abline(death.temp.lm, col="blue") ## lines(death.temp.ss, col="red") ## ----smooth.spline-residuals, echo=FALSE, warning=FALSE------------------ plot(chicago$tmpd, residuals(death.temp.lm), xlab="Temperature (F)", ylab="Prediction error (deaths/day)") abline(h=0, col="grey") lines(smooth.spline(x=chicago$tmpd, y=residuals(death.temp.lm), cv=TRUE), col="red") ## ----eval=FALSE---------------------------------------------------------- ## plot(chicago$tmpd, residuals(death.temp.lm), ## xlab="Temperature (F)", ## ylab="Prediction error (deaths/day)") ## abline(h=0, col="grey") ## lines(smooth.spline(x=chicago$tmpd, y=residuals(death.temp.lm), cv=TRUE), ## col="red") ## ----smooth.spline-on-sq-residuals, echo=FALSE, warning=FALSE------------ plot(chicago$tmpd, residuals(death.temp.lm)^2, log="y", xlab="Temperature (F)", ylab="Squared prediction error (deaths/day)^2") abline(h=mean(residuals(death.temp.lm)^2), lwd=2, col="grey") lines(smooth.spline(x=chicago$tmpd, y=residuals(death.temp.lm)^2, cv=TRUE), col="red") ## ----eval=FALSE---------------------------------------------------------- ## plot(chicago$tmpd, residuals(death.temp.lm)^2, log="y", ## xlab="Temperature (F)", ## ylab="Squared prediction error (deaths/day)^2") ## abline(h=mean(residuals(death.temp.lm)^2), ## lwd=2, col="grey") ## lines(smooth.spline(x=chicago$tmpd, y=residuals(death.temp.lm)^2, ## cv=TRUE), ## col="red") ## ----boxcox-of-established-model, echo=FALSE----------------------------- library(MASS) # Works with a previously-fit lm model boxcox(death.temp.lm) ## ----eval=FALSE---------------------------------------------------------- ## library(MASS) ## # Works with a previously-fit lm model ## boxcox(death.temp.lm) ## ----box-cox-on-new-model------------------------------------------------ # You can also give a model formula, and suppress plotting bc.death.temp <- boxcox(death ~ tmpd, data=chicago, plotit=FALSE) # Result is a list showing lambda values ($x component) vs. log-likelihood # (the $y component) # We can use this to get a (rough) maximum value for lambda (lambda.hat <- bc.death.temp$x[which.max(bc.death.temp$y)]) ## ----bc-transf-lm, echo=FALSE-------------------------------------------- chicago$bc.death <- (chicago$death^lambda.hat-1)/lambda.hat bc.lm <- lm(bc.death ~ tmpd, data=chicago) plot(death ~ tmpd, data=chicago, xlab="Temperature (Farenheit)", ylab="Mortality (deaths/day)") points(chicago[,"tmpd"], (lambda.hat*fitted(bc.lm)+1)^(1/lambda.hat), pch=19, col="blue") ## ----eval=FALSE---------------------------------------------------------- ## chicago$bc.death <- (chicago$death^lambda.hat-1)/lambda.hat ## bc.lm <- lm(bc.death ~ tmpd, data=chicago) ## plot(death ~ tmpd, data=chicago, ## xlab="Temperature (Farenheit)", ## ylab="Mortality (deaths/day)") ## points(chicago[,"tmpd"], (lambda.hat*fitted(bc.lm)+1)^(1/lambda.hat), pch=19, ## col="blue")