## ----echo=FALSE----------------------------------------------------------
library(knitr)
opts_chunk$set(size="small",background="white", highlight=FALSE, cache=TRUE, autdep=TRUE, tidy=TRUE)

## ----eval=FALSE----------------------------------------------------------
## anova(lm(y~x))

## ------------------------------------------------------------------------
library(gamair); data(chicago)
death.temp.lm <- lm(death ~ tmpd, data=chicago)
anova(death.temp.lm)

## ------------------------------------------------------------------------
# Simulate a Gaussian-noise simple linear regression model
# Inputs: x sequence; intercept; slope; noise variance; switch for whether to
  # return the simulated values, or the ratio of s^2_Y/\hat{\sigma}^2
# Output: data frame or coefficient vector
sim.gnslrm <- function(x, intercept, slope, sigma.sq, var.ratio=TRUE) {
  n <- length(x)
  y <- intercept + slope*x + rnorm(n,mean=0,sd=sqrt(sigma.sq))
  if (var.ratio) {
    mdl <- lm(y~x)
    hat.sigma.sq <- mean(residuals(mdl)^2)
    # R uses the n-1 denominator in var(), but we need the MLE
    s.sq.y <- var(y)*(n-1)/n
    return(s.sq.y/hat.sigma.sq)
  } else {
    return(data.frame(x=x, y=y))
  }
}

# Parameters
beta.0 <- 5
beta.1 <- 0  # We are simulating under the null!
sigma.sq <- 0.1
x.seq <- seq(from=-5, to=5, length.out=42)

## ----f-test-sims, echo=FALSE---------------------------------------------
# Run a bunch of simulations under the null and get all the F statistics
  # Actual F statistic is in the 4th column of the output of anova()
f.stats <- replicate(1000, anova(lm(y~x, data= sim.gnslrm(x.seq, beta.0, beta.1,
                                               sigma.sq, FALSE)))[1,4])
# Store histogram of the F statistics, but hold off on plotting it
null.hist <- hist(f.stats, breaks=50, plot=FALSE)
# Run a bunch of simulations under the alternative and get all the F statistics
alt.f <- replicate(1000, anova(lm(y~x, data=sim.gnslrm(x.seq, beta.0, -0.05,
                                           sigma.sq, FALSE)))[1,4])
# Store a histogram of this, but again hold off on plotting
alt.hist <- hist(alt.f, breaks=50, plot=FALSE)
# Create an empty plot
plot(0, xlim=c(0,30), ylim=c(0,1.2), xlab="F", ylab="Density", type="n")
# Add the histogram of F under the alternative, then under the null
plot(alt.hist, freq=FALSE, add=TRUE, col="grey", border="grey")
plot(null.hist, freq=FALSE, add=TRUE)
# Finally, the theoretical F distribution
curve(df(x,1,length(x.seq)-2), add=TRUE, col="blue")

## ----eval=FALSE----------------------------------------------------------
## # Run a bunch of simulations under the null and get all the F statistics
##   # Actual F statistic is in the 4th column of the output of anova()
## f.stats <- replicate(1000, anova(lm(y~x, data= sim.gnslrm(x.seq, beta.0, beta.1,
##                                                sigma.sq, FALSE)))[1,4])
## # Store histogram of the F statistics, but hold off on plotting it
## null.hist <- hist(f.stats, breaks=50, plot=FALSE)
## # Run a bunch of simulations under the alternative and get all the F statistics
## alt.f <- replicate(1000, anova(lm(y~x, data=sim.gnslrm(x.seq, beta.0, -0.05,
##                                            sigma.sq, FALSE)))[1,4])
## # Store a histogram of this, but again hold off on plotting
## alt.hist <- hist(alt.f, breaks=50, plot=FALSE)
## # Create an empty plot
## plot(0, xlim=c(0,30), ylim=c(0,1.2), xlab="F", ylab="Density", type="n")
## # Add the histogram of F under the alternative, then under the null
## plot(alt.hist, freq=FALSE, add=TRUE, col="grey", border="grey")
## plot(null.hist, freq=FALSE, add=TRUE)
## # Finally, the theoretical F distribution
## curve(df(x,1,length(x.seq)-2), add=TRUE, col="blue")

## ----f-test-pvals, echo=FALSE--------------------------------------------
# Take the simulated F statistics and convert to p-values
p.vals <- pf(f.stats, 1, length(x.seq)-2, lower.tail=FALSE)
alt.p <- pf(alt.f, 1, length(x.seq)-2, lower.tail=FALSE)
hist(alt.p, col="grey", freq=FALSE, xlab="p-value", main="", border="grey",
     xlim=c(0,1))
plot(hist(p.vals, plot=FALSE), add=TRUE, freq=FALSE)

## ----eval=FALSE----------------------------------------------------------
## # Take the simulated F statistics and convert to p-values
## p.vals <- pf(f.stats, 1, length(x.seq)-2, lower.tail=FALSE)
## alt.p <- pf(alt.f, 1, length(x.seq)-2, lower.tail=FALSE)
## hist(alt.p, col="grey", freq=FALSE, xlab="p-value", main="", border="grey",
##      xlim=c(0,1))
## plot(hist(p.vals, plot=FALSE), add=TRUE, freq=FALSE)

## ----lrt-illustrated, echo=FALSE-----------------------------------------
# Simulate from the model 1000 times, capturing the variance ratios
var.ratios <- replicate(1000, sim.gnslrm(x.seq,beta.0,beta.1,sigma.sq))
# Convert variance ratios into log likelihood-ratios
LLRs <- (length(x.seq)/2)*log(var.ratios)
# Create a histogram of 2*LLR
hist(2*LLRs, breaks=50, freq=FALSE, xlab=expression(2*Lambda), main="")
# Add the theoretical chi^2_1 distribution
curve(dchisq(x,df=1), col="blue", add=TRUE)

## ----eval=FALSE----------------------------------------------------------
## # Simulate from the model 1000 times, capturing the variance ratios
## var.ratios <- replicate(1000, sim.gnslrm(x.seq,beta.0,beta.1,sigma.sq))
## # Convert variance ratios into log likelihood-ratios
## LLRs <- (length(x.seq)/2)*log(var.ratios)
## # Create a histogram of 2*LLR
## hist(2*LLRs, breaks=50, freq=FALSE, xlab=expression(2*Lambda), main="")
## # Add the theoretical chi^2_1 distribution
## curve(dchisq(x,df=1), col="blue", add=TRUE)

## ----a-bad-regression----------------------------------------------------
# Simulate from a non-linear, non-constant-variance model
  # Curve: Y = (X-1)^2 * U
  # U ~ Unif(0.8, 1.2)
  # X ~ Exp(0.5)
# Inputs: number of data points; whether to return data frame or F test of
  # a simple linear model

sim.non.slr <- function(n, do.test=FALSE) {
    x <- rexp(n,rate=0.5)
    y <- (x-1)^2 * runif(n, min=0.8, max=1.2)
    if (! do.test) {
        return(data.frame(x=x,y=y))
    } else {
        # Fit a linear model, run F test, return p-value
        return(anova(lm(y~x))[["Pr(>F)"]][1])
    }
}

## ----sim-a-bad-regression, echo=FALSE------------------------------------
not.slr <- sim.non.slr(n=200)
plot(y~x, data=not.slr)
curve((x-1)^2, col="blue", add=TRUE)
abline(lm(y~x,data=not.slr), col="red")

## ----eval=FALSE----------------------------------------------------------
## not.slr <- sim.non.slr(n=200)
## plot(y~x, data=not.slr)
## curve((x-1)^2, col="blue", add=TRUE)
## abline(lm(y~x,data=not.slr), col="red")

## ----f-test-p-values-mean-little, echo=FALSE-----------------------------
f.tests <- replicate(1e4, sim.non.slr(n=200, do.test=TRUE))
hist(log10(f.tests),breaks=30,freq=FALSE, main="",
     xlab="log (base ten) of p value")

## ----eval=FALSE----------------------------------------------------------
## f.tests <- replicate(1e4, sim.non.slr(n=200, do.test=TRUE))
## hist(log10(f.tests),breaks=30,freq=FALSE, main="",
##      xlab="log (base ten) of p value")