# In-class demos for 2015-01-20 ### Illustration of over-fitting and model selection # 20 standard-Gaussian X's x = rnorm(20) # Quadratic Y's y = 7*x^2 - 0.5*x + rnorm(20) # Initial plot of training data plus true regression curve plot(x,y) curve(7*x^2-0.5*x,col="grey",add=TRUE) # Fit polynomials and add them to the plot # Make a list of model formulae for polynomials of degree 0 to 9 poly.formulae <- c("y~1", paste("y ~ poly(x,", 1:9, ")", sep="")) # because "y ~ poly(x,0)" doesn't work poly.formulae <- sapply(poly.formulae, as.formula) # Get evenly spaced points for pretty plotting models df.plot <- data.frame(x=seq(min(x),max(x),length.out=200)) # Fit polynomials of order 0 to 9 # Store them in a list fitted.models <- list(length=length(poly.formulae)) for (model_index in 1:length(poly.formulae)) { fm <- lm(formula=poly.formulae[[model_index]]) lines(df.plot$x, predict(fm,newdata=df.plot),lty=model_index) fitted.models[[model_index]] <- fm } legend("top",legend=paste("degree",0:9),lty=1:10, ncol=2, cex=0.5) # Calculate and plot in-sample errors mse.q <- sapply(fitted.models, function(mdl) { mean(residuals(mdl)^2) }) plot(0:9,mse.q,type="b",xlab="polynomial degree",ylab="mean squared error", log="y") # Plot the old curves with testing data x.new = rnorm(2e4) y.new = 7*x.new^2 - 0.5*x.new + rnorm(length(x.new)) plot(x.new[1:2e3],y.new[1:2e3],xlab="x",ylab="y",pch=24,cex=0.1,col="blue", xlim=2*range(x.new),ylim=2*range(y.new)) curve(7*x^2-0.5*x,col="grey",add=TRUE) newdf <- data.frame(x=seq(from=2*min(x.new),to=2*max(x.new),length.out=200)) for (model_index in 1:length(fitted.models)) { fm <- fitted.models[[model_index]] lines(newdf$x, predict(fm, newdf), lty=model_index) } points(x,y) legend("top",legend=paste("degree",0:9),lty=1:10, ncol=2, cex=0.5) # Calculate and plot the out-of-sample errors # Calculate an already-fitted model's MSE on new data # Inputs: the model object # Outputs: the MSE # ATTN: Hard-wired to use "x.new" and "y.new" --- can you improve this? gmse <- function(mdl) { predictions <- predict(mdl, data.frame(x=x.new)) residuals <- y.new - predictions return(mean(residuals^2)) } gmse.q <- sapply(fitted.models, gmse) plot(0:9,mse.q,type="b",xlab="polynomial degree", ylab="mean squared error",log="y",ylim=c(min(mse.q),max(gmse.q))) lines(0:9,gmse.q,lty=2,col="blue") points(0:9,gmse.q,pch=24,col="blue") # Note: logarithmic scale of vertical axis! # Demonstration: R^2 is an EPIC FAIL as a goodness-of-fit statistic # So is adjusted R^2 # Extract R^2 and adjusted R^2 from a fitted model # Inputs: the model object # Output: vector of length 2, R^2 first extract.rsqd <- function(mdl) { c( summary(mdl)$r.squared, summary(mdl)$adj.r.squared) } rsqd.q <- sapply(fitted.models, extract.rsqd) plot(0:9,rsqd.q[1,],type="b",xlab="polynomial degree",ylab=expression(R^2), ylim=c(0,1)) lines(0:9,rsqd.q[2,],type="b",lty="dashed") legend("bottomright",legend=c(expression(R^2),expression(R[adj]^2)), lty=c("solid","dashed")) # General function to do k-fold CV for a bunch of linear models # Inputs: dataframe to fit all models on, list or vector of model formulae, # number of folds of cross-validation # Output: vector of cross-validated MSEs for the models cv.lm <- function(data, formulae, nfolds=5) { # Strip data of NA rows # ATTN: Better to check whether NAs are in variables used by the models data <- na.omit(data) # Make sure the formulae have type "formula" formulae <- sapply(formulae, as.formula) # Extract the name of the response variable from each formula # ATTN: CV doesn't make a lot of sense unless these are all the same! responses <- sapply(formulae, response.name) names(responses) <- as.character(formulae) n <- nrow(data) # Assign each data point to a fold, at random # see ?sample for the effect of sample(x) on a vector x fold.labels <- sample(rep(1:nfolds, length.out=n)) mses <- matrix(NA, nrow=nfolds, ncol=length(formulae)) colnames <- as.character(formulae) # EXERCISE: Replace the double for() loop below by defining a new # function and then calling outer() for (fold in 1:nfolds) { test.rows <- which(fold.labels == fold) train <- data[-test.rows,] test <- data[test.rows,] for (form in 1:length(formulae)) { # Fit the model on the training data current.model <- lm(formula=formulae[[form]], data=train) # Generate predictions on the testing data predictions <- predict(current.model, newdata=test) # Get the responses on the testing data test.responses <- test[,responses[form]] test.errors <- test.responses - predictions mses[fold, form] <- mean(test.errors^2) } } return(colMeans(mses)) } # Extract the name of the response variable from a regression formula # Presumes response is the left-most variable # EXERCISE: Write a more robust version using terms() # Inputs: regression formula # Outputs: name of the response variable response.name <- function(formula) { var.names <- all.vars(formula) return(var.names[1]) } # How well does cross-validation work? # Remember that our original data had 20 points, and we're seeing how # we generalize to 20,000 points from the same distribution # Make a little data frame out of our little data little.df <- data.frame(x=x, y=y) # CV for the polynomials (defaults to five-fold) cv.q <- cv.lm(little.df, poly.formulae) plot(0:9,mse.q,type="b",xlab="polynomial degree", ylab="mean squared error",log="y",ylim=c(min(mse.q),max(gmse.q))) lines(0:9,gmse.q,lty=2,col="blue",type="b",pch=2) lines(0:9,cv.q,lty=3,col="red",type="b",pch=3) legend("topleft",legend=c("In-sample","Generalization","CV"), col=c("black","blue","red"),lty=1:3,pch=1:3)