## ----echo=FALSE---------------------------------------------------------- library(knitr) opts_chunk$set(size="small",background="white", highlight=FALSE, cache=TRUE, autdep=TRUE, tidy=TRUE) ## ------------------------------------------------------------------------ # Simulate from a linear model with uniform X and t-distributed noise # Inputs: number of points; intercept; slope; width of uniform X distribution # (symmetric around 0); degrees of freedom for t # Output: data frame with columns for X and Y sim.linmod <- function(n, beta.0, beta.1, width, df) { # draw n points from a uniform distribution centered on 0 x <- runif(n, min=-width/2, max=width/2) # draw n points from a t distribution with the given number of degrees # of freedom epsilon <- rt(n, df=df) # make y from a linear model y <- beta.0 + beta.1*x + epsilon # return the data frame return(data.frame(x=x, y=y)) } # Calculate in-sample MSE of a linear model # First define a function that works for just one slope/intercept pair at # time # Then "Vectorize" it to handle vectors of intercepts and slopes # Inputs: slope; intercept; data frame with "x" and "y" columns # Output: the in-sample MSE # Presumes: "y" is the target variable and "x" is the predictor mse.insample <- function(b.0, b.1, data) { mean((data$y-(b.0+b.1*data$x))^2) } mse.insample <- Vectorize(mse.insample, vectorize.args=c("b.0","b.1")) # Grids of possible intercepts and slopes b.0.seq <- seq(from=-10,to=10, length.out=20) b.1.seq <- seq(from=-10,to=10, length.out=20) # 3d wire-mesh ("perspective") plot of a linear model's error surface # Input: data set; maximum value for Z axis (for comparability across plots) # Output: Transformation matrix for adding new points/lines to the plot, # invisibly --- see help(persp) under "Value". (Ignored here) # ATTN: hard-coded slope/intercept sequences less than ideal in.sample.persp <- function(data, zmax=600) { # Calculate the in-sample MSE for every combination of z <- outer(b.0.seq, b.1.seq, mse.insample, data=data) persp(b.0.seq, b.1.seq, z, zlim=c(0,zmax), xlab="Intercept", ylab="Slope", zlab="MSE", ticktype="detailed") } ## ----error-surfaces, echo=FALSE------------------------------------------ par(mfrow=c(3,2)) in.sample.persp(sim.linmod(n=10,beta.0=5,beta.1=-2,width=4,df=3)) in.sample.persp(sim.linmod(n=10,beta.0=5,beta.1=-2,width=4,df=3)) in.sample.persp(sim.linmod(n=100,beta.0=5,beta.1=-2,width=4,df=3)) in.sample.persp(sim.linmod(n=100,beta.0=5,beta.1=-2,width=4,df=3)) in.sample.persp(sim.linmod(n=1e5,beta.0=5,beta.1=-2,width=4,df=3)) in.sample.persp(sim.linmod(n=1e5,beta.0=5,beta.1=-2,width=4,df=3)) par(mfrow=c(1,1)) ## ----eval=FALSE---------------------------------------------------------- ## par(mfrow=c(3,2)) ## in.sample.persp(sim.linmod(n=10,beta.0=5,beta.1=-2,width=4,df=3)) ## in.sample.persp(sim.linmod(n=10,beta.0=5,beta.1=-2,width=4,df=3)) ## in.sample.persp(sim.linmod(n=100,beta.0=5,beta.1=-2,width=4,df=3)) ## in.sample.persp(sim.linmod(n=100,beta.0=5,beta.1=-2,width=4,df=3)) ## in.sample.persp(sim.linmod(n=1e5,beta.0=5,beta.1=-2,width=4,df=3)) ## in.sample.persp(sim.linmod(n=1e5,beta.0=5,beta.1=-2,width=4,df=3)) ## par(mfrow=c(1,1)) ## ----scatter-of-regression-lines, echo=FALSE----------------------------- # Create an empty plot (type="n" for "do Nothing") plot(0,type="n",xlim=c(-10,10),ylim=c(-10,10),xlab="x",ylab="y") # Add the true regression line; exaggerate width so it stands out abline(a=5, b=-2, lwd=5) # Repeat 10 times: do a simulation, fit a line to the sim., add the fitted # line to the plot invisible(replicate(20, abline(lm(y ~ x, data=sim.linmod(n=10,beta.0=5, beta.1=-2,width=4,df=3)), col="grey"))) ## ----eval=FALSE---------------------------------------------------------- ## # Create an empty plot (type="n" for "do Nothing") ## plot(0,type="n",xlim=c(-10,10),ylim=c(-10,10),xlab="x",ylab="y") ## # Add the true regression line; exaggerate width so it stands out ## abline(a=5, b=-2, lwd=5) ## # Repeat 10 times: do a simulation, fit a line to the sim., add the fitted ## # line to the plot ## invisible(replicate(20, abline(lm(y ~ x, data=sim.linmod(n=10,beta.0=5, ## beta.1=-2,width=4,df=3)), ## col="grey"))) ## ----eval=FALSE---------------------------------------------------------- ## lm(y ~ x, data=df) ## ------------------------------------------------------------------------ # Make a very small simulated data set from our running examing toy.data <- sim.linmod(n=10, beta.0=5, beta.1=-2, width=4, df=3) # Fit the simple linear regression model to it by least squares lm(y~x, data=toy.data) ## ------------------------------------------------------------------------ names(lm(y~x, data=toy.data)) ## ------------------------------------------------------------------------ # Fit a linear model to the toy data, and store as toy.lm # The name of the estimated model needn't match that of the data, but it's # often a good idea toy.lm <- lm(y~x, data=toy.data) ## ------------------------------------------------------------------------ coefficients(toy.lm) ## ------------------------------------------------------------------------ fitted(toy.lm) ## ------------------------------------------------------------------------ residuals(toy.lm) ## ----toy-model, echo=FALSE----------------------------------------------- plot(y~x, data=toy.data, xlab="x",ylab="y", main="Simulated ('toy') data") abline(toy.lm) ## ----eval=FALSE---------------------------------------------------------- ## plot(y~x, data=toy.data, xlab="x",ylab="y", main="Simulated ('toy') data") ## abline(toy.lm) ## ----eval=FALSE---------------------------------------------------------- ## predict(object, newdata) ## ------------------------------------------------------------------------ predict(toy.lm, newdata=data.frame(x=1:5)) ## ------------------------------------------------------------------------ predict(toy.lm) predict(toy.lm, data=data.frame(x=1:5)) # What's wrong here? ## ------------------------------------------------------------------------ predict(toy.lm, newdata=data.frame(zebra=1:5)) ## ------------------------------------------------------------------------ predict(toy.lm, newdata=data.frame(x=1:5, zebra=6:10)) ## ------------------------------------------------------------------------ predict(toy.lm, newdata=data.frame(x=1:5), se.fit=TRUE)