# In-class demos for 2019-01-24
# Curve-estimation by local averaging
# Effects of changing bandwidth (= range of local averages) on the estimated
# curve
# Picking the right amount of averaging by cross-validation
# Set up the two true regression functions: one a sinusoid (which oscillates),
# the other logarithmic (monotone, with a decreasing slope)
# True regression function 1: a sinusoid modulated by a fast cosine,
# so it oscillates rapidly over [0, 3]
true.r <- function(x) {
  sin(x) * cos(20 * x)
}
# True regression function 2: logarithmic — monotone increasing,
# with slope that decreases in x
true.s <- function(x) {
  log(x + 1)
}
# Plot the two true regression functions over [0, 3]
curve(true.r(x),
      from = 0, to = 3, xlab = "x",
      ylab = expression(r(x)))
curve(true.s(x), from = 0, to = 3,
      xlab = "x", ylab = expression(s(x)))
# Generate random values where these are the true regression functions:
# x uniformly distributed on [0, 3], y = regression function + IID Gaussian
# noise. The Gaussian bit is not essential, nor is the magnitude (= std.
# dev. 0.15) of the noise.
x <- runif(300, 0, 3)
yr <- true.r(x) + rnorm(length(x), 0, 0.15)
ys <- true.s(x) + rnorm(length(x), 0, 0.15)
# Plot the simulated data, and overlay the true regression function
# Obviously in a real-data problem we don't get to see the grey curves!
plot(x, yr, xlab = "x", ylab = expression(r(x) + epsilon))
curve(true.r(x), col = "grey", add = TRUE)
plot(x, ys, xlab = "x", ylab = expression(s(x) + eta))
curve(true.s(x), col = "grey", add = TRUE)
# Effects of local averaging
# Let's try to estimate the regression function at x=1.6 by averaging
# values of y where the x is close to 1.6
# Set up the initial window (x.lo, x.hi), and re-plot the data to
# highlight the values we will include in the average
x.focus <- 1.6; x.lo <- x.focus - 0.1; x.hi <- x.focus + 0.1
# Logical mask for points inside the averaging window; highlighted in
# black, everything outside shown in grey
# (reconstructed comparison: the original "(xx.lo)" had its operators
# garbled; the parallel code in loc_ave_err below confirms the intent)
in.window <- (x < x.hi) & (x > x.lo)
colors <- ifelse(in.window, "black", "grey")
plot(x, yr, xlab = "x",
     ylab = expression(r(x) + epsilon),
     col = colors)
curve(true.r(x), col = "grey", add = TRUE)
# Now put a point showing x=1.6 and the average of the highlighted y values
points(x.focus,
       mean(yr[in.window]),
       pch = 18, cex = 2)
# Ditto for the other data set
plot(x, ys, xlab = "x",
     ylab = expression(s(x) + eta),
     col = colors)
curve(true.s(x), col = "grey", add = TRUE)
points(x.focus,
       mean(ys[in.window]),
       pch = 18, cex = 2)
# How does the error change as we change the width of the averaging window?
# Absolute error of the local average of y over the window (x0-h, x0+h),
# compared to the true regression value y0.
# Inputs: half-width of the averaging window (h)
#         vector of response values (y)
#         true regression value at the focal point (y0)
#         vector of regressor values (xvals; defaults to the global x,
#           which the original version read implicitly)
#         focal point (x0; defaults to the global x.focus, ditto)
# Output: |y0 - local average| (NaN if no points fall in the window,
#         since mean() of an empty vector is NaN)
loc_ave_err <- function(h, y, y0, xvals = x, x0 = x.focus) {
  in.window <- (x0 - h < xvals) & (x0 + h > xvals)
  abs(y0 - mean(y[in.window]))
}
# True regression values at the focal point, one for each curve
yr0 <- true.r(x.focus)
ys0 <- true.s(x.focus)
# Candidate window half-widths (radii), from 0.01 up to 1
radii <- (1:100) / 100
# Absolute error of the local average at each radius, for each data set
# (vapply rather than sapply guarantees a numeric vector comes back)
r.LAE <- vapply(radii, loc_ave_err, numeric(1),
                y = yr, y0 = yr0)
s.LAE <- vapply(radii, loc_ave_err, numeric(1),
                y = ys, y0 = ys0)
# Plot error against radius (log scale on the horizontal axis):
# solid = sinusoid, dashed = logarithmic
plot(radii, r.LAE,
     xlab = "Radius of averaging window", ylim = c(0, 1.1),
     ylab = "Absolute value of error", type = "l", log = "x")
lines(radii, s.LAE, lty = "dashed")
# Reference line at the noise standard deviation
abline(h = 0.15, col = "grey")
# Demo of bandwidth selection by cross-validation for 1-D kernel smoothing
# ATTN: This is JUST a demo of the concept
# YOU SHOULD NOT USE THIS IN HOMEWORK (unless specifically told to)
# It's not very fast, it's not very flexible, and it only handles 1D
# In general, use the built-in bandwidth selector from the np package
# Inputs: vector of regressor values (x)
#         vector of regressand values (y)
#         vector of bandwidths (bandwidths; default values pretty arbitrary)
#         number of folds of cross-validation (nfolds)
# Output: list with components "best.bw" (number, best bandwidth),
#         "CV_MSEs" (numeric vector, CV scores of all bandwidths),
#         "fold_MSEs" (numeric matrix, score of each bandwidth on each fold)
# Presumes: np package is installed
cv_bws_npreg <- function(x, y,
                         bandwidths = (1:50) / 50,
                         nfolds = 10) {
  # Load and attach the np package; require() returns FALSE (rather than
  # erroring) when the package is missing, so wrap it in stopifnot to fail
  # loudly up front. (require, not requireNamespace, because later script
  # code calls npreg() unqualified and relies on np being attached.)
  stopifnot(require(np))
  # How many data points?
  n <- length(x)
  # Sanity checks: more than one observation, equal numbers of x and y values...
  stopifnot(n > 1, length(y) == n)
  # ... more than one bandwidth...
  stopifnot(length(bandwidths) > 1)
  # ... at least one fold, fold number is an integer
  stopifnot(nfolds > 0, nfolds == trunc(nfolds))
  # Prepare a matrix to store fold-by-bandwidth out-of-sample errors
  fold_MSEs <- matrix(0,
                      nrow = nfolds,
                      ncol = length(bandwidths))
  # Name the columns by the bandwidths (coerced to strings), so we can
  # index by bandwidth inside the loop below
  colnames(fold_MSEs) <- bandwidths
  # Assign each data point to a fold: cycle through the fold labels
  # (e.g., 1, 2, 3, 1, 2, 3, ... if nfolds==3), then randomly permute,
  # so all folds have (nearly) equal numbers of random data points
  case.folds <- sample(rep(seq_len(nfolds), length.out = n))
  # Cycle through the folds
  for (fold in seq_len(nfolds)) {
    # The current fold is the testing set; everything else is training.
    # Index the test rows positively (rather than x[-train.rows]) to avoid
    # the x[-integer(0)] footgun, which silently returns ALL rows
    test.rows <- which(case.folds == fold)
    x.train <- x[-test.rows]
    y.train <- y[-test.rows]
    x.test <- x[test.rows]
    y.test <- y[test.rows]
    # For each bandwidth,
    for (bw in bandwidths) {
      # Fit the model on the training set, and evaluate it on the testing set
      fit <- npreg(txdat = x.train, tydat = y.train,
                   exdat = x.test, eydat = y.test, bws = bw)
      # The $MSE component of the fit is calculated on the evaluation set, if
      # it's given one, so we don't need to explicitly call predict() and
      # calculate differences here.
      # Exercise: If npreg didn't have this feature, how would you use predict?
      fold_MSEs[fold, paste(bw)] <- fit$MSE
    }
  }
  # Take the average MSE for each bandwidth, across folds
  CV_MSEs <- colMeans(fold_MSEs)
  # The best bandwidth is the one with the smallest CV'd error
  best.bw <- bandwidths[which.min(CV_MSEs)]
  return(list(best.bw = best.bw, CV_MSEs = CV_MSEs,
              fold_MSEs = fold_MSEs))
}
# Cross-validate the bandwidth for each simulated data set, over a grid
# of candidate bandwidths from 0.005 to 0.5
bws.grid <- (1:100) / 200
rbws <- cv_bws_npreg(x, yr, bandwidths = bws.grid)
sbws <- cv_bws_npreg(x, ys, bandwidths = bws.grid)
# Plot root CV MSE against bandwidth (log horizontal axis):
# solid = sinusoid data, dashed = logarithmic data
plot(bws.grid, sqrt(rbws$CV_MSEs), xlab = "Bandwidth",
     ylab = "Root CV MSE", type = "l", ylim = c(0, 0.6),
     log = "x")
lines(bws.grid, sqrt(sbws$CV_MSEs), lty = "dashed")
# Horizontal reference line at the noise standard deviation
abline(h = 0.15, col = "grey")
legend("topleft",
       legend = c("Sinusoid", "Logarithmic", "Std. dev. of noise"),
       lty = c("solid", "dashed", "solid"),
       col = c("black", "black", "grey"))
# EXERCISE:
# What (approximately) is the best bandwidth for
# the sinusoid (r(x)) data? (solid line in fig.)
# What (approximately) is the best bandwidth for
# the logarithmic (s(x)) data? (dashed line in fig.)
# Why is a smaller bandwidth better for the sinusoid
# than for the logarithmic curve?

# How well do the selected bandwidths actually work?
# Sort order of the x values, so the fitted curves draw left-to-right
x.ord <- order(x)
# Stack the two panels vertically
par(mfcol = c(2, 1))
# Sinusoid: data points, kernel fit at the CV-selected bandwidth (thick
# black line), and the true curve (grey)
plot(x, yr, xlab = "x", ylab = expression(r(x) + epsilon))
rhat <- np::npreg(bws = rbws$best.bw, txdat = x, tydat = yr)
lines(x[x.ord], fitted(rhat)[x.ord], lwd = 4)
curve(true.r(x), col = "grey", add = TRUE, lwd = 2)
# Logarithmic curve: same display for the other data set
plot(x, ys, xlab = "x", ylab = expression(s(x) + eta))
shat <- np::npreg(bws = sbws$best.bw, txdat = x, tydat = ys)
lines(x[x.ord], fitted(shat)[x.ord], lwd = 4)
curve(true.s(x), col = "grey", add = TRUE, lwd = 2)
# Restore the default single-panel layout
par(mfcol = c(1, 1))