## ----echo=FALSE---------------------------------------------------------- library(knitr) opts_chunk$set(size="small",background="white", highlight=FALSE, cache=TRUE, autdep=TRUE, tidy=TRUE) ## ----eval=FALSE---------------------------------------------------------- ## anova(lm(y~x)) ## ------------------------------------------------------------------------ library(gamair); data(chicago) death.temp.lm <- lm(death ~ tmpd, data=chicago) anova(death.temp.lm) ## ------------------------------------------------------------------------ # Simulate a Gaussian-noise simple linear regression model # Inputs: x sequence; intercept; slope; noise variance; switch for whether to # return the simulated values, or the ratio of s^2_Y/\hat{\sigma}^2 # Output: data frame or coefficient vector sim.gnslrm <- function(x, intercept, slope, sigma.sq, var.ratio=TRUE) { n <- length(x) y <- intercept + slope*x + rnorm(n,mean=0,sd=sqrt(sigma.sq)) if (var.ratio) { mdl <- lm(y~x) hat.sigma.sq <- mean(residuals(mdl)^2) # R uses the n-1 denominator in var(), but we need the MLE s.sq.y <- var(y)*(n-1)/n return(s.sq.y/hat.sigma.sq) } else { return(data.frame(x=x, y=y)) } } # Parameters beta.0 <- 5 beta.1 <- 0 # We are simulating under the null! sigma.sq <- 0.1 x.seq <- seq(from=-5, to=5, length.out=42) ## ----f-test-sims, echo=FALSE--------------------------------------------- # Run a bunch of simulations under the null and get all the F statistics # Actual F statistic is in the 4th column of the output of anova() f.stats <- replicate(1000, anova(lm(y~x, data= sim.gnslrm(x.seq, beta.0, beta.1, sigma.sq, FALSE)))[1,4]) # Store histogram of the F statistics, but hold off on plotting it null.hist <- hist(f.stats, breaks=50, plot=FALSE) # Run a bunch of simulations under the alternative and get all the F statistics alt.f <- replicate(1000, anova(lm(y~x, data=sim.gnslrm(x.seq, beta.0, -0.05, sigma.sq, FALSE)))[1,4]) # Store a histogram of this, but again hold off on plotting alt.hist <- hist(alt.f, breaks=50, plot=FALSE) # Create an empty plot plot(0, xlim=c(0,30), ylim=c(0,1.2), xlab="F", ylab="Density", type="n") # Add the histogram of F under the alternative, then under the null plot(alt.hist, freq=FALSE, add=TRUE, col="grey", border="grey") plot(null.hist, freq=FALSE, add=TRUE) # Finally, the theoretical F distribution curve(df(x,1,length(x.seq)-2), add=TRUE, col="blue") ## ----eval=FALSE---------------------------------------------------------- ## # Run a bunch of simulations under the null and get all the F statistics ## # Actual F statistic is in the 4th column of the output of anova() ## f.stats <- replicate(1000, anova(lm(y~x, data= sim.gnslrm(x.seq, beta.0, beta.1, ## sigma.sq, FALSE)))[1,4]) ## # Store histogram of the F statistics, but hold off on plotting it ## null.hist <- hist(f.stats, breaks=50, plot=FALSE) ## # Run a bunch of simulations under the alternative and get all the F statistics ## alt.f <- replicate(1000, anova(lm(y~x, data=sim.gnslrm(x.seq, beta.0, -0.05, ## sigma.sq, FALSE)))[1,4]) ## # Store a histogram of this, but again hold off on plotting ## alt.hist <- hist(alt.f, breaks=50, plot=FALSE) ## # Create an empty plot ## plot(0, xlim=c(0,30), ylim=c(0,1.2), xlab="F", ylab="Density", type="n") ## # Add the histogram of F under the alternative, then under the null ## plot(alt.hist, freq=FALSE, add=TRUE, col="grey", border="grey") ## plot(null.hist, freq=FALSE, add=TRUE) ## # Finally, the theoretical F distribution ## curve(df(x,1,length(x.seq)-2), add=TRUE, col="blue") ## ----f-test-pvals, echo=FALSE-------------------------------------------- # Take the simulated F statistics and convert to p-values p.vals <- pf(f.stats, 1, length(x.seq)-2, lower.tail=FALSE) alt.p <- pf(alt.f, 1, length(x.seq)-2, lower.tail=FALSE) hist(alt.p, col="grey", freq=FALSE, xlab="p-value", main="", border="grey", xlim=c(0,1)) plot(hist(p.vals, plot=FALSE), add=TRUE, freq=FALSE) ## ----eval=FALSE---------------------------------------------------------- ## # Take the simulated F statistics and convert to p-values ## p.vals <- pf(f.stats, 1, length(x.seq)-2, lower.tail=FALSE) ## alt.p <- pf(alt.f, 1, length(x.seq)-2, lower.tail=FALSE) ## hist(alt.p, col="grey", freq=FALSE, xlab="p-value", main="", border="grey", ## xlim=c(0,1)) ## plot(hist(p.vals, plot=FALSE), add=TRUE, freq=FALSE) ## ----lrt-illustrated, echo=FALSE----------------------------------------- # Simulate from the model 1000 times, capturing the variance ratios var.ratios <- replicate(1000, sim.gnslrm(x.seq,beta.0,beta.1,sigma.sq)) # Convert variance ratios into log likelihood-ratios LLRs <- (length(x.seq)/2)*log(var.ratios) # Create a histogram of 2*LLR hist(2*LLRs, breaks=50, freq=FALSE, xlab=expression(2*Lambda), main="") # Add the theoretical chi^2_1 distribution curve(dchisq(x,df=1), col="blue", add=TRUE) ## ----eval=FALSE---------------------------------------------------------- ## # Simulate from the model 1000 times, capturing the variance ratios ## var.ratios <- replicate(1000, sim.gnslrm(x.seq,beta.0,beta.1,sigma.sq)) ## # Convert variance ratios into log likelihood-ratios ## LLRs <- (length(x.seq)/2)*log(var.ratios) ## # Create a histogram of 2*LLR ## hist(2*LLRs, breaks=50, freq=FALSE, xlab=expression(2*Lambda), main="") ## # Add the theoretical chi^2_1 distribution ## curve(dchisq(x,df=1), col="blue", add=TRUE) ## ----a-bad-regression---------------------------------------------------- # Simulate from a non-linear, non-constant-variance model # Curve: Y = (X-1)^2 * U # U ~ Unif(0.8, 1.2) # X ~ Exp(0.5) # Inputs: number of data points; whether to return data frame or F test of # a simple linear model sim.non.slr <- function(n, do.test=FALSE) { x <- rexp(n,rate=0.5) y <- (x-1)^2 * runif(n, min=0.8, max=1.2) if (! do.test) { return(data.frame(x=x,y=y)) } else { # Fit a linear model, run F test, return p-value return(anova(lm(y~x))[["Pr(>F)"]][1]) } } ## ----sim-a-bad-regression, echo=FALSE------------------------------------ not.slr <- sim.non.slr(n=200) plot(y~x, data=not.slr) curve((x-1)^2, col="blue", add=TRUE) abline(lm(y~x,data=not.slr), col="red") ## ----eval=FALSE---------------------------------------------------------- ## not.slr <- sim.non.slr(n=200) ## plot(y~x, data=not.slr) ## curve((x-1)^2, col="blue", add=TRUE) ## abline(lm(y~x,data=not.slr), col="red") ## ----f-test-p-values-mean-little, echo=FALSE----------------------------- f.tests <- replicate(1e4, sim.non.slr(n=200, do.test=TRUE)) hist(log10(f.tests),breaks=30,freq=FALSE, main="", xlab="log (base ten) of p value") ## ----eval=FALSE---------------------------------------------------------- ## f.tests <- replicate(1e4, sim.non.slr(n=200, do.test=TRUE)) ## hist(log10(f.tests),breaks=30,freq=FALSE, main="", ## xlab="log (base ten) of p value")