# In-class demos for the lecture on the bootstrap, 36-402, Spring 2012 library(MASS) data(cats) summary(cats) # Is Moonshine over-weight? # Find 95th percentile of weight, assuming a Gaussian distribution of body # weights, matched to data (q95.gaussian <- qnorm(0.95,mean=mean(cats\$Bwt),sd=sd(cats\$Bwt))) ### How precisely do I know this? # Simulate from the fitted Gaussian rcats.gaussian <- function() { rnorm(n=nrow(cats),mean=mean(cats\$Bwt),sd=sd(cats\$Bwt)) } # Replicate the initial estimation on some surrogate data est.q95.gaussian <- function(x) { m <- mean(x) s <- sd(x) return(qnorm(0.95,mean=m,sd=s)) } # R exercise for the reader: convince yourself that this works properly # on the original data set # Draw 1000 surrogate data sets and repeat the estimation on each to get # an approximation to the sampling distribution sampling.dist.gaussian <- replicate(1000, est.q95.gaussian(rcats.gaussian())) # Plot the sampling distribution: histogram, smoothed density estimate plot(hist(sampling.dist.gaussian,breaks=50),freq=FALSE) plot(density(sampling.dist.gaussian)) abline(v=q95.gaussian,lty=2) # Add original value # Find measures of uncertainty from the 1000 bootstrap replicates # Standard error of the estimate: sd(sampling.dist.gaussian) # 95% confidence interval for estimate (crude): quantile(sampling.dist.gaussian,c(0.025,0.975)) # Better confidence intervals (see notes): 2*q95.gaussian - quantile(sampling.dist.gaussian,c(0.975,0.025)) # Is the Gaussian distribution assumption a good one? plot(hist(cats\$Bwt),freq=FALSE) lines(density(cats\$Bwt),lty=2) curve(dnorm(x,mean=mean(cats\$Bwt),sd=sd(cats\$Bwt)),add=TRUE,col="purple") # Not very bell-curve looking. # Is Moonshine over-weight?, take two # Direct or empirical, non-parametric estimate of 95th percentile (q95.np <- quantile(cats\$Bwt,0.95)) # How uncertain is this? resample <- function(x) { sample(x,size=length(x),replace=TRUE) } est.q95.np <- function(x) { quantile(x,0.95) } # R exercise for the reader: convince yourself that this works properly # on the original data set sampling.dist.np <- replicate(1000, est.q95.np(resample(cats\$Bwt))) plot(density(sampling.dist.np)) abline(v=q95.np,lty=2) # Standard error of the estimate: sd(sampling.dist.np) # Bias of the estimate: mean(sampling.dist.np - q95.np) # Crude 95% CI: quantile(sampling.dist.np,c(0.025,0.975)) # More refined CI: 2*q95.np - quantile(sampling.dist.np,c(0.975,0.025)) # Relating heart weight (in grams) to body weight (in kilograms): plot(Hwt~Bwt, data=cats, xlab="Body weight (kg)", ylab="Heart weight (g)") cats.lm <- lm(Hwt ~ Bwt, data=cats) abline(cats.lm) coefficients(cats.lm) confint(cats.lm) plot(cats\$Bwt,residuals(cats.lm)) plot(density(residuals(cats.lm))) # CIs by resampling cases: # First re-fit on a subset of the data coefs.cats.lm <- function(subset) { fit <- lm(Hwt~Bwt,data=cats,subset=subset) return(coefficients(fit)) } # R exercise for the reader: convince yourself that this works properly # when given all the rows # R exercise for the reader: convince yourself that if the subset vector # contains the same index multiple times, this actually, and appropriately, # changes the result # Now do on many random subsets cats.lm.sampling.dist <- replicate(1000, coefs.cats.lm(resample(1:nrow(cats)))) (limits <- apply(cats.lm.sampling.dist,1,quantile,c(0.025,0.975))) # Noticeably broader # R exercise for the reader: get the more refined limits out of this