# Examples of training data

# 20 standard-Gaussian X's
x <- rnorm(20)
# Independent Y's
y0 <- rnorm(20)
# Linear Y's
y1 <- -0.5 * x + rnorm(20)
# Quadratic Y's
y2 <- 7 * x^2 - 0.5 * x + rnorm(20)

# Plot the simplest case: independent response
plot(x, y0)
# Fit a constant (intercept-only model) and draw it as a horizontal line
y0.0 <- lm(y0 ~ 1)
abline(h = y0.0$coefficients[1])

# Evenly spaced points across the data range, for pretty plotting
d <- seq(-2, 2, length.out = 200)

# Fit polynomials of degree 1 through 9 to the pure-noise response
for (degree in 1:9) {
  fm <- lm(y0 ~ poly(x, degree))
  # Stash each fit under its own name ("y0.1", ..., "y0.9");
  # the assign/paste trick here is often useful
  assign(paste("y0", degree, sep = "."), fm)
  # Overlay each fitted curve, using a different line type per degree
  lines(d, predict(fm, data.frame(x = d)), lty = degree + 1)
}

# Calculate and plot in-sample mean-squared error for degrees 0-9
mse <- numeric(10)
for (degree in 0:9) {
  # The get() function is the inverse to assign()
  fm <- get(paste("y0", degree, sep = "."))
  mse[degree + 1] <- mean(summary(fm)$residuals^2)
}
plot(0:9, mse, type = "b", xlab = "polynomial degree",
     ylab = "mean squared error")

# Create testing data --- substantially bigger than the training set,
# so we can treat this as fairly representative of how well it would
# work. If 200 data points seems too small, you could easily use
# 2 million. Actually, in this case you could _calculate_ exactly what
# would happen, using the properties of the Gaussian distribution, but
# that's too much of a pain for a special case.
x.new <- rnorm(200)
y0.new <- rnorm(200)

# Calculate the out-of-sample (generalization) errors for degrees 0-9
gmse <- numeric(10)
for (degree in 0:9) {
  # Retrieve the stored fitted models (assigned as "y0.<degree>" above)
  fm <- get(paste("y0", degree, sep = "."))
  # Predict on new data
  predictions <- predict(fm, data.frame(x = x.new))
  # Out-of-sample residuals; named resids to avoid shadowing
  # the stats::residuals() function
  resids <- y0.new - predictions
  # Square and average
  gmse[degree + 1] <- mean(resids^2)
}
# Add the OOS curve to the in-sample MSE plot: lines and points
lines(0:9, gmse, col = "blue", lty = 2)
points(0:9, gmse, pch = 24, col = "blue")

# Calculate the OOS errors restricted to new points falling within the
# original training range (i.e., excluding extrapolation)
range.ok <- (x.new <= max(x)) & (x.new >= min(x))
gmse.restricted <- numeric(10)
for (degree in 0:9) {
  fm <- get(paste("y0", degree, sep = "."))
  predictions <- predict(fm, data.frame(x = x.new[range.ok]))
  resids <- y0.new[range.ok] - predictions
  gmse.restricted[degree + 1] <- mean(resids^2)
}
# FIX: the original drew this curve with the exact same col/lty/pch as
# the unrestricted OOS curve on the same plot, making the two
# indistinguishable; use a distinct style so both can be compared.
lines(0:9, gmse.restricted, col = "red", lty = 3)
points(0:9, gmse.restricted, pch = 25, col = "red")

# Quadratic example: the true regression function is 7x^2 - 0.5x
plot(x, y2)
curve(7 * x^2 - 0.5 * x, col = "grey", add = TRUE)
# Fit a constant, plot it
y2.0 <- lm(y2 ~ 1)
abline(h = y2.0$coefficients[1])
# Get evenly spaced points for pretty plotting
d <- seq(-2, 2, length.out = 200)
# Fit polynomials of order 1 to 9
for (degree in 1:9) {
  fm <- lm(y2 ~ poly(x, degree))
  # Store the results; the assign/paste trick here is often useful
  assign(paste("y2", degree, sep = "."), fm)
  # Plot them, with different line types
  lines(d, predict(fm, data.frame(x = d)), lty = degree + 1)
}

# Re-plot with testing data, widening the axes to the new data's range
y2.new <- 7 * x.new^2 - 0.5 * x.new + rnorm(200)
plot(x, y2, xlim = range(x.new), ylim = range(y2.new))
curve(7 * x^2 - 0.5 * x, col = "grey", add = TRUE)
y2.0 <- lm(y2 ~ 1)
abline(h = y2.0$coefficients[1])
# Wider grid so the curves extend over the testing data's range
# (deliberate extrapolation beyond the training x's)
d <- seq(-4, 4, length.out = 200)
for (degree in 1:9) {
  fm <- get(paste("y2", degree, sep = "."))
  lines(d, predict(fm, data.frame(x = d)), lty = degree + 1)
}
points(x.new, y2.new, col = "blue", pch = 24)

# Calculate and plot in- and out-of-sample errors for the quadratic case
mse.q <- numeric(10)
gmse.q <- numeric(10)
for (degree in 0:9) {
  # The get() function is the inverse to assign()
  fm <- get(paste("y2", degree, sep = "."))
  mse.q[degree + 1] <- mean(summary(fm)$residuals^2)
  predictions <- predict(fm, data.frame(x = x.new))
  resids <- y2.new - predictions
  gmse.q[degree + 1] <- mean(resids^2)
}
plot(0:9, mse.q, type = "b", xlab = "polynomial degree",
     ylab = "mean squared error", ylim = c(0, 50))
lines(0:9, gmse.q, lty = 2, col = "blue")
points(0:9, gmse.q, pch = 24, col = "blue")