\documentclass{article}
\input{../common}
\newcommand{\RidgeEstimator}{\tilde{\mathbf{\beta}}_{\lambda}}
\begin{document}
\title{Lecture 18: Tests and Confidence Sets for Multiple Coefficients}
\author{36-401, Fall 2015, Section B}
\date{27 October 2015}
\maketitle
\tableofcontents

<< echo=FALSE >>=
library(knitr)
opts_chunk$set(size='small', background='white', cache=TRUE, autodep=TRUE, options(show.signif.stars=FALSE))
@

Throughout, we'll assume that the Gaussian-noise multiple linear regression model
\begin{equation}
Y = \beta_0 + \beta_1 X_1 + \ldots + \beta_p X_p + \epsilon
\end{equation}
with $\epsilon \sim N(0,\sigma^2)$ independent of the $X_i$s and independent across observations, is completely correct.  We will also use the least squares or maximum likelihood estimates of the slopes,
\begin{equation}
\widehat{\mathbf{\beta}} = (\mathbf{x}^T\mathbf{x})^{-1}\mathbf{x}^T\mathbf{y}
\end{equation}
Under these assumptions, the estimator has a multivariate Gaussian distribution,
\begin{equation}
\widehat{\mathbf{\beta}} \sim MVN(\mathbf{\beta}, \sigma^2 (\mathbf{x}^T\mathbf{x})^{-1})
\end{equation}
The maximum likelihood estimate of $\sigma^2$, $\hat{\sigma}^2$, is given by
\begin{equation}
\hat{\sigma}^2 = \frac{1}{n}(\mathbf{y} - \mathbf{x}\widehat{\mathbf{\beta}})^T(\mathbf{y} - \mathbf{x}\widehat{\mathbf{\beta}})
\end{equation}
This is slightly negatively biased, $\Expect{\hat{\sigma}^2} = \frac{n-p-1}{n}\sigma^2$, and has the sampling distribution
\begin{equation}
\frac{n\hat{\sigma}^2}{\sigma^2} \sim \chi^2_{n-p-1}
\end{equation}
$\hat{\sigma}^2\frac{n}{n-p-1}$ is an unbiased estimator of $\sigma^2$.

\section{$z$- and $t$-Tests for Single Coefficients}

Let's write the true standard error of the estimator $\hat{\beta}_i$ as $\se{\hat{\beta}_i}$.  From the general theory about the variance of $\widehat{\mathbf{\beta}}$,
\begin{equation}
\se{\hat{\beta}_i} = \sqrt{\sigma^2 {(\mathbf{x}^T\mathbf{x})^{-1}_{i+1, i+1}}}
\end{equation}
(Why $i+1$?)  Further, from the Gaussian distribution of $\widehat{\mathbf{\beta}}$,
\begin{equation}
\frac{\hat{\beta}_i - \beta_i}{\se{\hat{\beta}_i}} \sim N(0,1)
\end{equation}
If we know $\sigma^2$, so that we can compute $\se{\hat{\beta}_i}$, we can use this to either test hypotheses about the exact value of $\beta_i$, or to form confidence intervals.  Specifically, a $1-\alpha$ CI would be
\begin{equation}
\hat{\beta}_i \pm z(\alpha/2) \se{\hat{\beta}_i}
\end{equation}
with $z(p)$ being the $p^{\mathrm{th}}$ quantile of the standard Gaussian distribution.

If we instead use the unbiased estimate of $\sigma^2$, $\hat{\sigma}^2\frac{n}{n-p-1}$, to obtain an estimate $\sehat{\hat{\beta}_i}$, we find that
\begin{equation}
\frac{\hat{\beta}_i - \beta_i}{\sehat{\hat{\beta}_i}} \sim t_{n-p-1}
\end{equation}
The reasoning for this is exactly parallel to why we got $t_{n-2}$ distributions for simple linear regression, so I won't rehearse it here.  It follows that
\begin{equation}
\hat{\beta}_i \pm t_{n-p-1}(\alpha/2)\sehat{\hat{\beta}_i}
\end{equation}
is a $1-\alpha$ confidence interval for $\beta_i$.  This is implemented in the \texttt{confint} function, when applied to the output of \texttt{lm}.  As $n\rightarrow\infty$, this becomes
\begin{equation}
\hat{\beta}_i \pm z(\alpha/2) \hat{\sigma} \sqrt{(\mathbf{x}^T\mathbf{x})^{-1}_{i+1,i+1}}
\end{equation}
which is often a quite practical alternative to the $t$-based interval.
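To make this concrete, here is a minimal sketch in R; the data frame \texttt{my.data} and the variables \texttt{y}, \texttt{x1}, \texttt{x2} are hypothetical stand-ins.  \texttt{confint} gives the $t$-based intervals directly, and the same interval can be rebuilt by hand from \texttt{vcov}, which is based on the unbiased estimate of $\sigma^2$.
<<eval=FALSE>>=
# Hypothetical data frame my.data, with response y and predictors x1, x2
fit <- lm(y ~ x1 + x2, data=my.data)
# t-based 95% confidence intervals for all the coefficients:
confint(fit, level=0.95)
# The same interval for the coefficient on x1, assembled by hand;
# vcov() uses the unbiased estimate of sigma^2, and
# fit$df.residual is n-p-1:
est <- coef(fit)["x1"]
se.hat <- sqrt(vcov(fit)["x1", "x1"])
est + c(-1, 1) * qt(1 - 0.05/2, df=fit$df.residual) * se.hat
@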
\subsection{What, Exactly, Is \texttt{summary} Testing?}

When you run \texttt{summary} on the output of \texttt{lm}, part of what it delivers is a table containing estimated coefficients and standard errors, along with a $t$-statistic and a $p$-value for each one.  It is important to be very clear about the hypothesis being tested here.  There is in fact a different null hypothesis for each row of the table.  The null hypothesis for $\beta_i$ is that
\begin{equation}
Y = \beta_0 + \beta_1 X_1 + \ldots + \beta_{i-1} X_{i-1} + 0 X_i + \beta_{i+1} X_{i+1} + \ldots + \beta_p X_p + \epsilon
\end{equation}
with $\epsilon$ being mean-zero, constant-variance independent Gaussian noise.  The alternative hypothesis is that
\begin{equation}
Y = \beta_0 + \beta_1 X_1 + \ldots + \beta_{i-1} X_{i-1} + \beta_i X_i + \beta_{i+1} X_{i+1} + \ldots + \beta_p X_p + \epsilon
\end{equation}
with $\beta_i \neq 0$, and the same assumptions about $\epsilon$.

This matters because {\em whether the null hypothesis is true or not depends on what other variables are included in the model}.  The optimal coefficient on $X_i$ might be zero with one set of covariates and non-zero with another.  The $t$ test is, by its nature, incapable of saying whether $X_i$ should be included in the model or not.  (This is in addition to the usual cautions about whether testing $\beta_i=0$ is really informative, about not mistaking ``detectably different from zero'' for ``important'', and about how any $\beta_i \neq 0$ will eventually have a $p$-value arbitrarily close to 0.)

\subsection{No, Really, Whether Coefficients Are Zero Changes with the Covariates}
\label{sec:betas-change-with-covariates}

Here is the simplest situation I know of which illustrates that the true (optimal or population-level) coefficient of a given predictor variable changes with the other variables included in the model.

Suppose that the true model is
\begin{equation}
Y = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \epsilon
\end{equation}
with all the usual assumptions being met.  Without knowing this, we instead estimate the model
\begin{equation}
Y = \gamma_0 + \gamma_1 X_1 + \eta
\end{equation}
We know, from our study of the simple linear model, that the (optimal or population) value of $\gamma_1$ is
\begin{equation}
\gamma_1 = \frac{\Cov{X_1, Y}}{\Var{X_1}}
\end{equation}
Substituting in for $Y$,
\begin{eqnarray}
\gamma_1 & = & \frac{\Cov{X_1, \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \epsilon}}{\Var{X_1}}\\
& = & \frac{\Cov{X_1, \beta_0} + \Cov{X_1, \beta_1 X_1} + \Cov{X_1, \beta_2 X_2} + \Cov{X_1,\epsilon}}{\Var{X_1}}\\
& = & \frac{0 + \beta_1 \Cov{X_1, X_1} + \beta_2 \Cov{X_1,X_2} + 0}{\Var{X_1}}\\
& = & \beta_1 + \beta_2 \frac{\Cov{X_1, X_2}}{\Var{X_1}}
\end{eqnarray}
Thus, even if $\beta_1= 0$, we can easily have $\gamma_1 \neq 0$, and vice versa.  (See also Exercise \ref{exercise:multiple-true-models}.)

\section{Variance Ratio ($F$) Tests for Multiple Coefficients Being Zero}

If we want to test whether a group of multiple coefficients are all simultaneously zero, the traditional approach is a variance ratio or $F$ test.  To lay everything out, the null hypothesis is that
\begin{equation}
Y = \beta_0 + \beta_1 X_1 + \ldots + \beta_q X_q + 0 X_{q+1} + \ldots + 0 X_p + \epsilon
\end{equation}
while the alternative is
\begin{equation}
Y = \beta_0 + \beta_1 X_1 + \ldots + \beta_q X_q + \beta_{q+1} X_{q+1} + \ldots + \beta_p X_p + \epsilon
\end{equation}
with at least one of the coefficients $\beta_{q+1}, \ldots \beta_p \neq 0$.
The null hypothesis, then, is that in a linear model which includes all the predictors $X_1, \ldots X_p$, the optimal coefficients for the last $p-q$ variables are all zero. For both models, we get an estimate of $\sigma^2$, say $\hat{\sigma}^2_{null}$ for the null model (with coefficients fixed at zero) and $\hat{\sigma}^2_{full}$ for the full model. Because the null model is a special case of the full model, and we estimate parameters in each case by minimizing the MSE, $\hat{\sigma}^2_{null} \geq \hat{\sigma}^2_{full}$. Following reasoning exactly parallel to the way we got the $F$ test for the simple linear regression model, \begin{equation} \frac{n\hat{\sigma}^2_{full}}{\sigma^2} \sim \chi^2_{n-p-1} \end{equation} while, under the null hypothesis, \begin{equation} \frac{n(\hat{\sigma}^2_{null} - \hat{\sigma}^2_{full})}{\sigma^2} \sim \chi^2_{p-q} \end{equation} and so (again under the null hypothesis) \begin{equation} \frac{(\hat{\sigma}^2_{null} - \hat{\sigma}^2_{full})/(p-q)}{\hat{\sigma}^2_{full}/(n-p-1)} \sim F_{p-q, n-p-1} \end{equation} We therefore reject the null hypothesis when the test statistic \begin{equation} \label{eqn:f-stat} F = \frac{(\hat{\sigma}^2_{null} - \hat{\sigma}^2_{full})/(p-q)}{\hat{\sigma}^2_{full}/(n-p-1)} \end{equation} is too large compared to the $F_{p-q, n-p-1}$ distribution. This is why this is called an $F$ test for this set of regression coefficients. If we're not testing all the coefficients at once, this is a {\bf partial} $F$ test. The proper interpretation of this test is ``Does letting the slopes for $X_{q+1}, \ldots X_p$ be non-zero reduce the MSE more than we would expect just by noise?'' As $n$ grows, increasingly small improvements will become clearly detectable as not-noise, so increasingly small but non-zero sets of coefficients will be detected as significant by the $F$ test. \paragraph{Cautions} The variance ratio test does not test any of the following: \begin{itemize} \item Whether some variable not among $X_1, \ldots X_p$ ought to be included in the model. \item Whether the relationship between $Y$ and the $X_i$ is linear. \item Whether the Gaussian noise assumption holds. \item Whether any of the other modeling assumptions hold. \end{itemize} \subsection{All Slopes at Once} An obvious special case is the hypothesis that all the coefficients are zero. That is, the null hypothesis is \begin{equation} Y = \beta_0 + 0 X_1 + \ldots + 0 X_p + \epsilon \end{equation} with the alternative being the full model \begin{equation} Y = \beta_0 + \beta_1 X_1 + \ldots + \beta_p X_p + \epsilon \end{equation} The estimate of $\sigma^2$ under the null is the sample variance of $Y$, $s^2_Y$, so the test statistic becomes \begin{equation} \frac{(s^2_Y - \hat{\sigma}^2_{full})/p}{\hat{\sigma}^2_{full}/(n-p-1)} \end{equation} whose distribution under the null is $F_{p, n-p-1}$. This {\bf full} $F$ test is often called a test of the significance of the whole regression. This is true, but has to be understood in a very specific sense. We are testing whether, if $Y$ is linearly regressed on $X_1, \ldots X_p$ and only on those variables, the reduction in the MSE from actually estimating slopes over just using a flat regression surface is bigger than we'd expect from pure noise. Once again, the test has no power to detect violations of any of the modeling assumptions. (See the discussion of the $F$ test for simple linear regression in Lecture 10.) \subsection{Variance Ratio Tests in R} This is most easily done through the \texttt{anova} function. 
We fit the null model and the full model, both with \texttt{lm}, and then pass them to the \texttt{anova} function:
<<>>=
mobility <- read.csv("http://www.stat.cmu.edu/~cshalizi/mreg/15/dap/1/mobility.csv")
mob.null <- lm(Mobility ~ Commute, data=mobility)
mob.full <- lm(Mobility ~ Commute + Latitude + Longitude, data=mobility)
anova(mob.null, mob.full)
@
The second row tells us that the full model has two more parameters than the null, that $n(\hat{\sigma}^2_{null} - \hat{\sigma}^2_{full}) = \Sexpr{anova(mob.null, mob.full)[["Sum of Sq"]][2]}$, and then what the variance ratio or $F$ statistic and the corresponding $p$-value are.  Here, we learn that the decrease in the root-MSE which comes from adding latitude and longitude as predictors, while very small ($\Sexpr{signif(100*sqrt(mean(residuals(mob.null)^2)-mean(residuals(mob.full)^2)),2)}$ percentage points), is large enough that it is unlikely to have arisen by capitalizing on noise\footnote{Once again, this presumes that the only two possibilities in the world are a completely-correct linear-Gaussian model with just commuting time as a predictor, and a completely-correct linear-Gaussian model with commuting time, latitude and longitude as predictors.}.

\subsection{Variable Deletion via $F$ Tests}

It's not uncommon to use $F$ tests for variable deletion: pick your least favorite set of predictors, test whether all of their $\beta$s are zero, and, if so, delete them from the model (and re-estimate).  Presuming that we can trust the modeling assumptions, there are still a few points about this procedure which are slightly dubious, or at least call for much more caution than is often exercised.

\paragraph{Statistical power} The test controls the probability of rejecting when the null is true --- it guarantees that if $\mathbf{\beta}_q = \mathbf{0}$, we have a low probability of rejecting that null hypothesis.  For deletion to be reliable, however, we'd want a low probability of {\em missing} variables with non-zero coefficients, i.e., a low probability of retaining the null hypothesis when it's wrong, or high power to detect departures from the null.  Power cannot be read off from the $p$-value, and grows with the magnitude of the departure from the null.  One way to get at this is, as usual, to complement the hypothesis test with a confidence set for the coefficients in question.  Ignoring variables whose coefficients are {\em precisely} estimated to be close to zero is much more sensible than ignoring variables because their coefficients can only be estimated very loosely.

\paragraph{Non-transitivity} The variance ratio test checks whether the MSE of the smaller model is significantly or detectably worse than the MSE of the full model.  One drawback to this is that a series of insignificant, undetectably-small steps can add up to a significant, detectably-big change.  In mathematical jargon: ``is equal to'' is a transitive relation, so that if $A=B$ and $B=C$, $A=C$.  But ``insignificantly different from'' is not a transitive relation, so if $A \approx B$ and $B \approx C$, we can't conclude $A \approx C$.  Concretely: a group of variables might show up as significant in a partial $F$ test, even though none of them was individually significant on a $t$ test in the full model\footnote{This is yet another reason not to pay so much attention to the $p$-values reported by \texttt{summary}.}.
Also, if we delete variables in stages, we can have a situation where at each stage the increase in MSE is insignificant, but the difference between the full model and the final model is highly significant.

\subsection{Likelihood Ratio Tests}

As with the $F$ test for simple linear models, there is an alternative based on the likelihood ratio.  As with the simple model, the log-likelihood of the model, at the maximum likelihood estimate, is
\begin{equation}
-\frac{n}{2}(1+\log{2\pi}) - \frac{n}{2}\log{\hat{\sigma}^2}
\end{equation}
Hence the difference in log-likelihoods between the full model, with all $p$ slopes estimated, and the null model, with only $q$ slopes estimated and the other $p-q$ fixed at zero, is
\begin{equation}
\Lambda = -\frac{n}{2}\log{\hat{\sigma}^2_{full}} + \frac{n}{2}\log{\hat{\sigma}^2_{null}} = \frac{n}{2}\log{\frac{\hat{\sigma}^2_{null}}{\hat{\sigma}^2_{full}}}
\end{equation}
This is the log of the ratio of likelihoods (not the ratio of log-likelihoods!).  Under the null hypothesis\footnote{Strictly speaking, this only becomes exact as $n\rightarrow\infty$.  The issue is that deriving the $\chi^2$ distribution for $\Lambda$ presumes every parameter's maximum likelihood estimate has a Gaussian distribution around its true value (see Lecture 10), and while this is true for the $\hat{\beta}_i$s, it is only approximately true for $\hat{\sigma}^2$.  See Exercise \ref{exercise:chisq-approaches-gaussian}.},
\begin{equation}
2 \Lambda \sim \chi^2_{p-q}
\end{equation}
The same cautions apply to the likelihood ratio test as to the $F$ test: it does not check modeling assumptions.

One advantage of likelihood ratio tests is that exactly the same procedure can be used to test the hypothesis that $\mathbf{\beta}_q = \mathbf{0}$ and to test $\mathbf{\beta}_q = \mathbf{\beta}_q^*$, for any other particular vector of parameters.  For that matter, it can be used to test $\mathbf{c} \mathbf{\beta} = \mathbf{r}$, where $\mathbf{c}$ is any non-random $q \times (p+1)$ matrix, and $\mathbf{r}$ is any non-random $q \times 1$ vector.  Thus, for example, it can be used to test the hypothesis that two slopes are {\em equal}, or that all slopes are equal, etc.

\paragraph{Likelihood Ratio vs.\ F Tests} For linear-Gaussian models, both the likelihood ratio and the $F$ statistic are functions of the ratio $\hat{\sigma}^2_{null}/\hat{\sigma}^2_{full}$ (Exercise \ref{exercise:test-stats}).  For fixed $p$ and $q$, as $n\rightarrow \infty$, the two tests deliver the same $p$-values when $\hat{\sigma}^2_{null}/\hat{\sigma}^2_{full}$ is the same.  At finite $n$, they are somewhat different, with the $F$ test usually giving a somewhat higher $p$-value than the $\chi^2$ test, particularly if $p$ is close to $n$.  Which test is more {\em accurate} is another question.  The likelihood ratio test can actually work for large $n$ when the model is mis-specified, in the sense of telling us which wrong model is closer to the truth \citep{Vuong-testing-non-nested-hypotheses}, while the $F$ test's refinements over the $\chi^2$ very much depend on all the modeling assumptions being right.
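As a concrete illustration, here is a minimal sketch of how the likelihood ratio test could be carried out in R for the mobility models fit above; \texttt{logLik} extracts the maximized Gaussian log-likelihood from an \texttt{lm} fit, so the quantities below correspond to $\Lambda$ and to the asymptotic $\chi^2_{p-q}$ $p$-value, with the $F$-test $p$-value from \texttt{anova} shown purely for comparison.
<<eval=FALSE>>=
# Likelihood ratio test of mob.null against mob.full:
# Lambda = difference in maximized log-likelihoods.
Lambda <- as.numeric(logLik(mob.full) - logLik(mob.null))
df.extra <- length(coef(mob.full)) - length(coef(mob.null))  # p - q, here 2
# Asymptotic chi-squared p-value for the likelihood ratio test:
pchisq(2*Lambda, df=df.extra, lower.tail=FALSE)
# For comparison, the F-test p-value:
anova(mob.null, mob.full)[["Pr(>F)"]][2]
@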
\begin{figure}
<<echo=FALSE>>=
n <- 15; p <- 10; q <- 9
curve(pchisq(n*log(x),df=p-q,lower.tail=FALSE),from=1,to=2,
      xlab=expression(hat(sigma)[null]^2/hat(sigma)[full]^2),ylab="p-value")
curve(pf((x-1)*(n-p-1)/(p-q),p-q,n-p-1,lower.tail=FALSE),add=TRUE,col="blue")
n2 <- 60
curve(pchisq(n2*log(x),df=p-q,lower.tail=FALSE),add=TRUE,lty="dashed")
curve(pf((x-1)*(n2-p-1)/(p-q),p-q,n2-p-1,lower.tail=FALSE),add=TRUE,
      col="blue",lty="dashed")
legend("topright",ncol=2,
       legend=c("LRT, n=15","F, n=15", "LRT, n=60", "F, n=60"),
       col=c("black","blue","black","blue"),
       lty=c("solid","solid","dashed","dashed"))
@
\caption{$p$-values obtained from a likelihood ratio test (black) and an $F$ test (blue) on the same data, as functions of the ratio $\hat{\sigma}^2_{null}/\hat{\sigma}^2_{full}$, with $p=10$, $q=9$, and $n$ either 15 (solid) or 60 (dashed).  In general, the difference between the two tests goes to zero as $n-p$ grows.  (See source file for code.)}
\end{figure}

\clearpage

\section{Confidence Sets for Multiple Coefficients}

Suppose we want to do inference on two coefficients, say $\beta_i$ and $\beta_j$, at once.  That means we need to come up with a two-dimensional confidence region $C(\alpha)$, where we can say that $\Prob{(\beta_i, \beta_j) \in C(\alpha)} = 1-\alpha$.  This would involve the same sort of trilemma as confidence intervals for single coefficients.  That is, one of three things must be true:
\begin{enumerate}
\item Both $\beta_i$ and $\beta_j$ are in $C(\alpha)$; or
\item We got data which was very ($\leq \alpha$) improbable under all possible values of the parameters; or
\item Our model is wrong.
\end{enumerate}
If we trust our model, then, we can indeed be confident that both $\beta_i$ and $\beta_j$ are simultaneously in $C(\alpha)$.

Clearly, nothing depends on wanting to do inference on just two coefficients at once; we could consider any subset of them we like, up to all $p+1$ of them.

With one parameter, intervals are the most natural confidence sets to work with.  With more than one parameter, we have choices to make about the {\em shape} of the confidence set.  The two easiest ones to work with are rectangular boxes, and ellipsoids.

\subsection{Confidence Boxes or Rectangles}
\label{sec:confidence-boxes}

The natural thing to want to do is to take a confidence interval for each coefficient and put them together into a confidence box or rectangle.  For instance, using the $t$-distribution CI for $\beta_i$ and $\beta_j$, the box would be
\begin{equation}
(\hat{\beta}_i \pm t_{n-p-1}(\alpha/2)\sehat{\hat{\beta}_i}) \times (\hat{\beta}_j \pm t_{n-p-1}(\alpha/2)\sehat{\hat{\beta}_j})
\end{equation}
(And similarly for three or more parameters.)

This is, however, not quite right as I've written it.  The problem is that while each interval covers its true coefficient with high probability, whether both intervals {\em simultaneously} cover their coefficients is a different story.  Let me abbreviate the interval for $\beta_i$ as $C_i(\alpha)$, and likewise the interval for $\beta_j$ as $C_j(\alpha)$.
We have
\begin{equation}
\Prob{\beta_i \in C_i(\alpha)} = 1-\alpha ~,~ \Prob{\beta_j \in C_j(\alpha)} = 1-\alpha
\end{equation}
but from this it does not follow that
\begin{equation}
\Prob{\beta_i \in C_i(\alpha), \beta_j \in C_j(\alpha)} = 1-\alpha
\end{equation}
To see this, let's consider the complementary event: it's
\begin{equation}
\beta_i \not \in C_i(\alpha) \vee \beta_j \not \in C_j(\alpha)
\end{equation}
writing $\vee$ for logical-or.\footnote{That is, $A \vee B$ means in ordinary English ``$A$ is true or $B$ is true or both are true''.}  By basic probability,
\begin{equation}
\Prob{\beta_i \not \in C_i(\alpha) \vee \beta_j \not \in C_j(\alpha)} = \Prob{\beta_i \not \in C_i(\alpha)} + \Prob{\beta_j \not \in C_j(\alpha)} - \Prob{\beta_i \not \in C_i(\alpha), \beta_j \not \in C_j(\alpha)}
\end{equation}
Since $C_i$ and $C_j$ are $1-\alpha$-confidence sets,
\begin{equation}
\Prob{\beta_i \not \in C_i(\alpha) \vee \beta_j \not \in C_j(\alpha)} = 2\alpha - \Prob{\beta_i \not \in C_i(\alpha), \beta_j \not \in C_j(\alpha)} \leq 2\alpha
\end{equation}
So $C_i(\alpha) \times C_j(\alpha)$ isn't itself a $1-\alpha$ confidence set; its real confidence level could be as little as $1-2\alpha$.  If we had been looking at $q$ coefficients at once, the confidence level might have been as low as $1-q\alpha$.

This suggests, however, a very simple, if sometimes over-cautious, way of building a confidence box.  If we want the final box to have a $1-\alpha$ confidence level, and we're dealing with $q$ coefficients at once, we calculate $1-\alpha/q$ confidence intervals for each coefficient, say $C_i(\alpha/q)$, and then set
\begin{equation}
C(\alpha) = C_1(\alpha/q) \times C_2(\alpha/q) \times \ldots \times C_q(\alpha/q)
\end{equation}
By our reasoning above, this final $C(\alpha)$ will cover all $q$ parameters at once with probability at least $1-\alpha$.

\begin{figure}
<<echo=FALSE>>=
n <- 50
beta1.hat <- 1
beta2.hat <- 2
se1 <- 1
se2 <- 0.5
alpha <- 0.05
beta1.lo <- beta1.hat + se1*qt(alpha/2, df=n-3)
beta1.hi <- beta1.hat - se1*qt(alpha/2, df=n-3)
beta2.lo <- beta2.hat + se2*qt(alpha/2, df=n-3)
beta2.hi <- beta2.hat - se2*qt(alpha/2, df=n-3)
beta1.lo.ex <- beta1.hat + se1*qt((alpha/2)/2, df=n-3)
beta1.hi.ex <- beta1.hat - se1*qt((alpha/2)/2, df=n-3)
beta2.lo.ex <- beta2.hat + se2*qt((alpha/2)/2, df=n-3)
beta2.hi.ex <- beta2.hat - se2*qt((alpha/2)/2, df=n-3)
plot(beta1.hat, beta2.hat, xlim=c(-5,5), ylim=c(0,5),
     xlab=expression(beta[1]), ylab=expression(beta[2]))
segments(x0=c(beta1.hat,beta1.lo), y0=c(beta2.lo,beta2.hat),
         x1=c(beta1.hat,beta1.hi), y1=c(beta2.hi,beta2.hat), col="grey")
polygon(x=c(beta1.lo.ex,beta1.lo.ex,beta1.hi.ex,beta1.hi.ex),
        y=c(beta2.lo.ex,beta2.hi.ex,beta2.hi.ex,beta2.lo.ex))
@
\caption{Grey lines: 95\% confidence intervals for two coefficients, based on inverting $t$ tests, and so centered at the point estimate (dot).  Black box: a 95\% confidence rectangle for both coefficients simultaneously.  Notice that the grey lines do not touch the sides of the rectangle; the latter correspond to 97.5\% CIs for each coefficient.  If we did draw the rectangle corresponding to the grey lines, its actual confidence level could be as low as 90\%.  (See source file for code.)}
\label{fig:confidence-rectangle}
\end{figure}

This trick of building a $1-\alpha$ confidence box for $q$ parameters at once from $q$ $1-\alpha/q$ confidence intervals is completely generic; it doesn't just work on regression coefficients, but for any parameters of any statistical model at all.
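As a quick R sketch of the same trick for two regression coefficients, we can simply ask \texttt{confint} for intervals at the higher level $1-\alpha/q$; here, for the latitude and longitude coefficients of the mobility model fit earlier, so $q=2$, $\alpha=0.05$, and each marginal interval is a 97.5\% CI.
<<eval=FALSE>>=
# Bonferroni-corrected 95% confidence box for two coefficients:
# each marginal interval is computed at level 1 - 0.05/2 = 0.975.
q <- 2
alpha <- 0.05
confint(mob.full, parm=c("Latitude", "Longitude"), level=1 - alpha/q)
@
The Cartesian product of the two intervals in this output is then a (conservative) 95\% confidence rectangle for the pair of coefficients, exactly the Bonferroni construction just described.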
For more on it, see \S \ref{sec:further-reading} below.

\clearpage

\subsection{Confidence Balls or Ellipsoids}
\label{sec:confidence-ellipsoids}

An alternative to confidence boxes is to try to make confidence {\em balls}.  To see how this could work, suppose first that $\hat{\beta}_i$ and $\hat{\beta}_j$ were uncorrelated.  Since
\begin{equation}
\frac{\hat{\beta}_i - \beta_i}{\se{\hat{\beta}_i}} \sim N(0,1)
\end{equation}
(and likewise for $\beta_j$), we would have\footnote{Because when $Z_1, \ldots Z_d$ are independent $N(0,1)$ variables, $\sum_{i}{Z_i^2} \sim \chi^2_d$.}
\begin{equation}
{\left(\frac{\hat{\beta}_i - \beta_i}{\se{\hat{\beta}_i}}\right)}^2 + {\left(\frac{\hat{\beta}_j - \beta_j}{\se{\hat{\beta}_j}}\right)}^2 \sim \chi^2_2
\end{equation}
Therefore, a simultaneous $1-\alpha$ confidence region for $(\beta_i, \beta_j)$ would be the region where
\begin{equation}
{\left(\frac{\hat{\beta}_i - \beta_i}{\se{\hat{\beta}_i}}\right)}^2 + {\left(\frac{\hat{\beta}_j - \beta_j}{\se{\hat{\beta}_j}}\right)}^2 \leq \chi^2_2(1-\alpha)
\end{equation}
A little geometry shows that this region is an ellipse, its axes parallel to the coordinate axes, with the length from end to end along one axis being $2\se{\hat{\beta}_i}\sqrt{\chi^2_2(1-\alpha)}$, and its length along the other axis being $2\se{\hat{\beta}_j}\sqrt{\chi^2_2(1-\alpha)}$.  If we had $q$ different uncorrelated coefficients, the confidence region would be the set $(\beta_1, \beta_2, \ldots \beta_q)$ where
\begin{equation}
\sum_{i=1}^{q}{{\left(\frac{\hat{\beta}_i - \beta_i}{\se{\hat{\beta}_i}}\right)}^2} \leq \chi^2_q(1-\alpha)
\end{equation}
When $q > 2$, we call this region an ``ellipsoid'' rather than an ``ellipse'', but it's the same idea.

Usually, of course, the different coefficient estimates are correlated with each other, so we need to do something a bit different.  If we write $\mathbf{\beta}_q$ for the vector of coefficients we're interested in, and $\mathbf{\Sigma}_q$ for the variance-covariance matrix of its estimator $\widehat{\mathbf{\beta}}_q$, then the confidence region is the set of all $\mathbf{\beta}_q$ where
\begin{equation}
(\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)^T \mathbf{\Sigma}^{-1}_q (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q) \leq \chi^2_q(1-\alpha)
\end{equation}
This, too, is an ellipsoid, only now the axes point in the directions given by the eigenvectors of $\mathbf{\Sigma}_q$, and the length along each axis is proportional to the square root of the corresponding eigenvalue.  (See \S \ref{sec:chisq-for-confidence-balls} for a derivation.)

Since $\mathbf{\Sigma}_q$ is a $q \times q$ sub-matrix of $\sigma^2 (\mathbf{x}^T\mathbf{x})^{-1}$, and $\sigma^2$ is unknown, we can't actually use this directly.  We can, however, use the appropriate sub-matrix of $\hat{\sigma}^2(\mathbf{x}^T\mathbf{x})^{-1}$ as an approximation, which becomes exact as $n\rightarrow\infty$.  Similarly, if we use the unbiased estimate of $\sigma^2$, we replace $\chi^2_q(1-\alpha)$ with $q\,F_{q,n-p-1}(1-\alpha)$.

\subsubsection{Confidence Ellipsoids in R}

The package \texttt{ellipse} \citep{ellipse-package} contains functions for plotting 2D confidence ellipses.  The main function is also called \texttt{ellipse}, which happens to have a specialized method for \texttt{lm} models.  The usage is
<<eval=FALSE>>=
my.model <- lm(y ~ x1+x2+x3)
plot(ellipse(my.model, which=c(1,2), level=0.95))
@
Here \texttt{which} is the vector of coefficient indices (it can only be of length 2) and \texttt{level} is the confidence level.
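For instance, with the mobility model fit earlier, something like the following sketch would draw the joint 95\% confidence ellipse for the latitude and longitude coefficients (the third and fourth coefficients, counting the intercept) and mark the point estimate at its center.
<<eval=FALSE>>=
library(ellipse)
# Joint 95% confidence ellipse for the Latitude and Longitude coefficients
# (coefficients 3 and 4 of mob.full, counting the intercept as the first):
plot(ellipse(mob.full, which=c(3,4), level=0.95), type="l")
# Mark the point estimate at the center of the ellipse:
points(coef(mob.full)[3], coef(mob.full)[4], pch=19)
@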
Notice that what \texttt{ellipse} actually returns is a two-column array of coordinates, which can be plotted, or passed along to other graphics functions (like \texttt{points} or \texttt{lines}).  See Figure \ref{fig:confidence-ellipses}.

\begin{figure}
<<>>=
library(ellipse)
par(mfrow=c(3,2))
plot(ellipse(mob.full, which=c(1,2), level=1-0.05/6), type="l")
plot(ellipse(mob.full, which=c(1,3), level=1-0.05/6), type="l")
plot(ellipse(mob.full, which=c(1,4), level=1-0.05/6), type="l")
plot(ellipse(mob.full, which=c(2,3), level=1-0.05/6), type="l")
plot(ellipse(mob.full, which=c(2,4), level=1-0.05/6), type="l")
plot(ellipse(mob.full, which=c(3,4), level=1-0.05/6), type="l")
@
\caption{Confidence ellipses for every pair of coefficients in the model where economic mobility is regressed on the prevalence of short commutes, latitude and longitude.  (Remember the intercept is the first coefficient.)  Why do I use this odd-looking confidence level?}
\label{fig:confidence-ellipses}
\end{figure}

Three-dimensional confidence ellipsoids can be made with the \texttt{rgl} library \citep{rgl-package}.  While confidence ellipsoids exist in any number of dimensions, they can't really be visualized when $q > 3$.

\clearpage

\subsubsection{Where the $\chi^2_q$ Comes From}
\label{sec:chisq-for-confidence-balls}

To see why this should be so, we need to do some linear algebra, to turn a Gaussian random vector with correlations and unequal variances into a Gaussian random vector where the coordinates are all $\sim N(0,1)$ and independent of each other.

The starting point is the fact that $\mathbf{\Sigma}_q$ is a square, symmetric, positive-definite matrix.  Therefore it can be written as follows:
\begin{equation}
\mathbf{\Sigma}_q = \mathbf{V} \mathbf{U} \mathbf{V}^T
\end{equation}
where $\mathbf{U}$ is the diagonal matrix of eigenvalues, and $\mathbf{V}$ is the matrix whose columns are the eigenvectors; $\mathbf{V}^T$ is its transpose, and $\mathbf{V}^T\mathbf{V} = \mathbf{I}$.  If we define $\mathbf{\Sigma}_q^{1/2} = \mathbf{V} \mathbf{U}^{1/2}$, where $\mathbf{U}^{1/2}$ is the diagonal matrix with the square roots of the eigenvalues, then
\begin{eqnarray}
\Var{\mathbf{\Sigma}_q^{-1/2} (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)} & = & \mathbf{\Sigma}_q^{-1/2} \Var{\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q} (\mathbf{\Sigma}_q^{-1/2})^T\\
& = & \mathbf{U}^{-1/2} \mathbf{V}^{-1} \mathbf{V} \mathbf{U} \mathbf{V}^T \mathbf{V} \mathbf{U}^{-1/2}\\
& = & \mathbf{U}^{-1/2} \mathbf{U} \mathbf{U}^{-1/2}\\
& = & \mathbf{I}
\end{eqnarray}
where the last step works because $\mathbf{U}$ and $\mathbf{U}^{-1/2}$ are both diagonal matrices.  In other words, while the coordinates of $\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q$ have unequal variances and are correlated with each other, $\mathbf{\Sigma}_q^{-1/2} (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)$ is a random vector where each coordinate has variance 1 and is uncorrelated with the others.  Since the initial vector was Gaussian, this too is Gaussian, hence
\begin{equation}
\mathbf{\Sigma}_q^{-1/2} (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q) \sim MVN(\mathbf{0}, \mathbf{I})
\end{equation}
Therefore
\begin{equation} \label{eqn:chisq-from-summing-indep-gaussians}
\left(\mathbf{\Sigma}_q^{-1/2} (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)\right)^T \mathbf{\Sigma}_q^{-1/2} (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q) \sim \chi^2_q
\end{equation}
since it's a sum of $q$ squared, independent $N(0,1)$ variables.
On the other hand,
\begin{eqnarray}
\lefteqn{\left(\mathbf{\Sigma}^{-1/2}_q(\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)\right)^T \left(\mathbf{\Sigma}^{-1/2}_q(\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)\right)} & &\\ \nonumber
& = & (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)^T \left(\mathbf{\Sigma}^{-1/2}_q\right)^T \mathbf{\Sigma}^{-1/2}_q (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)\\
& = & (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)^T \mathbf{V} \mathbf{U}^{-1/2} \mathbf{U}^{-1/2} \mathbf{V}^{-1} (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)\\
& = & (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)^T \mathbf{V} \mathbf{U}^{-1} \mathbf{V}^{-1} (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)\\
& = & (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)^T \mathbf{\Sigma}^{-1}_q (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q) \label{eqn:oh-look-a-sum-of-indep-gaussians}
\end{eqnarray}
Combining Eqs.\ \ref{eqn:chisq-from-summing-indep-gaussians} and \ref{eqn:oh-look-a-sum-of-indep-gaussians},
\begin{equation}
(\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q)^T \mathbf{\Sigma}^{-1}_q (\widehat{\mathbf{\beta}}_q - \mathbf{\beta}_q) \sim \chi^2_{q}
\end{equation}
as was to be shown.

\section{Further Reading}
\label{sec:further-reading}

Variance and likelihood ratio tests go back to the period of the 1910s--1930s; see references in Lecture 10.  Further exposition can be found in any textbook on regression, or general mathematical statistics.

The trick in \S \ref{sec:confidence-boxes}, of getting an over-all confidence level of $1-\alpha$ for $q$ parameters simultaneously, by demanding the higher confidence level of $1-\alpha/q$ for each one separately, is one use of an important tool called {\bf Bonferroni correction} or {\bf Bonferroni adjustment}\footnote{Computer scientists, and some mathematicians, call it a ``union bound'' --- can you explain why?}.  For an account of the role of this general idea in probability theory, see \citet{Galambos-and-Simonelli-on-Boneferroni}.  Bonferroni correction is also often used for hypothesis testing: if we test $q$ distinct hypotheses, and we want the probability of making {\em any} false rejections to be at most $\alpha$, we can achieve that by having each test be of size $\alpha/q$.  Indeed, we could give each test whatever size we like, so long as the sizes of the tests sum to $\alpha$.  One sometimes encounters the mis-understanding that Bonferroni correction requires the test statistics or confidence intervals to be statistically independent (e.g., \citealt{FGAshby-stats-of-fMRI}); as you can see from the argument above, this is just wrong.  What is true is that Bonferroni correction is very cautious, and that one can sometimes come up with less conservative ways of doing multiple inference if one either uses more detailed information about how the statistics relate to each other (as in \S \ref{sec:confidence-ellipsoids}), or one is willing to tolerate a certain number of false positives.  The latter idea leads to important work on multiple testing and ``false discovery control'', which is outside the scope of this course, but see \citet{Benjamini-Hochberg-fdr, Genovese-Wasserman-stochastic-process-for-false-discovery}, and, for an unforgettable demonstration of how ignoring multiple testing issues leads to nonsense, \citet{Bennett-et-al-the-dead-salmon}.

\section{Exercises}

To think through or to practice on, not to hand in.
\begin{enumerate}
\item \label{exercise:multiple-true-models} In the scenario of \S \ref{sec:betas-change-with-covariates}, is it possible for both $\epsilon$ and $\eta$ to obey the Gaussian noise assumption?  That is, is it possible to have $\epsilon \sim N(0, \sigma^2_{\epsilon})$, independent of $X_1$ and $X_2$, and to have $\eta \sim N(0, \sigma^2_{\eta})$, independent of $X_1$?  {\em Hint:} Suppose $X_1$ and $X_2$ are jointly Gaussian, and, for simplicity, that both have mean 0.
\item \label{exercise:test-stats}
\begin{enumerate}
\item Show that the variance ratio test statistic (Eq.\ \ref{eqn:f-stat}) depends on the data only through the ratio $\hat{\sigma}^2_{null}/\hat{\sigma}^2_{full}$.
\item Show that as $\hat{\sigma}^2_{null} \rightarrow \hat{\sigma}^2_{full}$,
\begin{equation}
\log{\frac{\hat{\sigma}^2_{null}}{\hat{\sigma}^2_{full}}} \rightarrow \frac{\hat{\sigma}^2_{null} - \hat{\sigma}^2_{full}}{\hat{\sigma}^2_{full}}
\end{equation}
\end{enumerate}
\item Lecture 8 argued that every confidence set comes from inverting a hypothesis test.  What is the hypothesis test corresponding to the confidence boxes of \S \ref{sec:confidence-boxes}?  That is, find an explicit form of the test statistic and of the rejection region.
\item \label{exercise:chisq-approaches-gaussian} Let $X_n \sim \chi^2_{n-p}$, with fixed $p$.
\begin{enumerate}
\item Show that $X_n/n$ approaches a constant $a$, and find $a$.
\item Show that $(X_n - an)/\sqrt{n}$ approaches a Gaussian distribution, and find the expectation and variance.  {\em Hint:} show that the moment generating functions converge.
\item Combine the previous results to write the limiting distribution of $X_n/n$ as a Gaussian, whose parameters (may) change with $n$.
\end{enumerate}
\end{enumerate}

\bibliography{locusts}
\bibliographystyle{crs}

\end{document}