# In-class demos for 2015-01-20


### Illustration of over-fitting and model selection

# 20 standard-Gaussian X's
x = rnorm(20)
# Quadratic Y's
y = 7*x^2 - 0.5*x + rnorm(20)

# Initial plot of training data plus true regression curve
plot(x,y)
curve(7*x^2-0.5*x,col="grey",add=TRUE)

# Fit polynomials and add them to the plot
# Make a list of model formulae for polynomials of degree 0 to 9
poly.formulae <- c("y~1", paste("y ~ poly(x,", 1:9, ")", sep=""))
  # because "y ~ poly(x,0)" doesn't work
poly.formulae <- sapply(poly.formulae, as.formula)
# Get evenly spaced points for pretty plotting models
df.plot <- data.frame(x=seq(min(x),max(x),length.out=200))
# Fit polynomials of order 0 to 9
  # Store them in a list
fitted.models <- list(length=length(poly.formulae))
for (model_index in 1:length(poly.formulae)) {
  fm <- lm(formula=poly.formulae[[model_index]])
  lines(df.plot$x, predict(fm,newdata=df.plot),lty=model_index)
  fitted.models[[model_index]] <- fm
}
legend("top",legend=paste("degree",0:9),lty=1:10, ncol=2, cex=0.5)

# Calculate and plot in-sample errors
mse.q <- sapply(fitted.models, function(mdl) { mean(residuals(mdl)^2) })
plot(0:9,mse.q,type="b",xlab="polynomial degree",ylab="mean squared error",
     log="y")


# Plot the old curves with testing data
x.new = rnorm(2e4)
y.new = 7*x.new^2 - 0.5*x.new + rnorm(length(x.new))
plot(x.new[1:2e3],y.new[1:2e3],xlab="x",ylab="y",pch=24,cex=0.1,col="blue",
     xlim=2*range(x.new),ylim=2*range(y.new))
curve(7*x^2-0.5*x,col="grey",add=TRUE)
newdf <- data.frame(x=seq(from=2*min(x.new),to=2*max(x.new),length.out=200))
for (model_index in 1:length(fitted.models)) {
  fm <- fitted.models[[model_index]]
  lines(newdf$x, predict(fm, newdf), lty=model_index)
}
points(x,y)
legend("top",legend=paste("degree",0:9),lty=1:10, ncol=2, cex=0.5)

# Calculate and plot the out-of-sample errors

# Calculate an already-fitted model's MSE on new data
  # Inputs: the model object
  # Outputs: the MSE
  # ATTN: Hard-wired to use "x.new" and "y.new" --- can you improve this?
gmse <- function(mdl) {
  predictions <- predict(mdl, data.frame(x=x.new))
  residuals <- y.new - predictions
  return(mean(residuals^2))
}
gmse.q <- sapply(fitted.models, gmse)
plot(0:9,mse.q,type="b",xlab="polynomial degree",
     ylab="mean squared error",log="y",ylim=c(min(mse.q),max(gmse.q)))
lines(0:9,gmse.q,lty=2,col="blue")
points(0:9,gmse.q,pch=24,col="blue")
# Note: logarithmic scale of vertical axis!


# Demonstration: R^2 is an EPIC FAIL as a goodness-of-fit statistic
# So is adjusted R^2

# Extract R^2 and adjusted R^2 from a fitted model
  # Inputs: the model object
  # Output: vector of length 2, R^2 first
extract.rsqd <- function(mdl) {
  c( summary(mdl)$r.squared, summary(mdl)$adj.r.squared)
}
rsqd.q <- sapply(fitted.models, extract.rsqd)
plot(0:9,rsqd.q[1,],type="b",xlab="polynomial degree",ylab=expression(R^2),
  ylim=c(0,1))
lines(0:9,rsqd.q[2,],type="b",lty="dashed")
legend("bottomright",legend=c(expression(R^2),expression(R[adj]^2)),
  lty=c("solid","dashed"))


# General function to do k-fold CV for a bunch of linear models
  # Inputs: dataframe to fit all models on, list or vector of model formulae,
    # number of folds of cross-validation
  # Output: vector of cross-validated MSEs for the models
cv.lm <- function(data, formulae, nfolds=5) {
  # Strip data of NA rows
    # ATTN: Better to check whether NAs are in variables used by the models
  data <- na.omit(data)
  # Make sure the formulae have type "formula"
  formulae <- sapply(formulae, as.formula)
  # Extract the name of the response variable from each formula
    # ATTN: CV doesn't make a lot of sense unless these are all the same!
  responses <- sapply(formulae, response.name)
  names(responses) <- as.character(formulae)
  n <- nrow(data)
  # Assign each data point to a fold, at random
    # see ?sample for the effect of sample(x) on a vector x
  fold.labels <- sample(rep(1:nfolds, length.out=n))
  mses <- matrix(NA, nrow=nfolds, ncol=length(formulae))
  colnames <- as.character(formulae)
  # EXERCISE: Replace the double for() loop below by defining a new
  # function and then calling outer()
  for (fold in 1:nfolds) {
    test.rows <- which(fold.labels == fold)
    train <- data[-test.rows,]
    test <- data[test.rows,]
    for (form in 1:length(formulae)) {
       # Fit the model on the training data
       current.model <- lm(formula=formulae[[form]], data=train)
       # Generate predictions on the testing data
       predictions <- predict(current.model, newdata=test)
       # Get the responses on the testing data
       test.responses <- test[,responses[form]]
       test.errors <- test.responses - predictions
       mses[fold, form] <- mean(test.errors^2)
    }
  }
  return(colMeans(mses))
}

# Extract the name of the response variable from a regression formula
  # Presumes response is the left-most variable
  # EXERCISE: Write a more robust version using terms()
  # Inputs: regression formula
  # Outputs: name of the response variable
response.name <- function(formula) {
  var.names <- all.vars(formula)
  return(var.names[1])
}


# How well does cross-validation work?
  # Remember that our original data had 20 points, and we're seeing how
  # we generalize to 20,000 points from the same distribution
# Make a little data frame out of our little data
little.df <- data.frame(x=x, y=y)
# CV for the polynomials (defaults to five-fold)
cv.q <- cv.lm(little.df, poly.formulae)
plot(0:9,mse.q,type="b",xlab="polynomial degree",
     ylab="mean squared error",log="y",ylim=c(min(mse.q),max(gmse.q)))
lines(0:9,gmse.q,lty=2,col="blue",type="b",pch=2)
lines(0:9,cv.q,lty=3,col="red",type="b",pch=3)
legend("topleft",legend=c("In-sample","Generalization","CV"),
       col=c("black","blue","red"),lty=1:3,pch=1:3)