## ----echo=FALSE----------------------------------------------------------
library(knitr)
opts_chunk$set(size="small",background="white", highlight=FALSE, cache=TRUE, autdep=TRUE, tidy=TRUE, options(show.signif.stars=FALSE))

## ----linear-response-and-quadratic-heteroskedasticity, echo=FALSE, out.height="0.25\\textheight"----
plot(0,type="n",xlim=c(-5,5),ylim=c(-15,5), xlab="x", ylab="y")
abline(a=3,b=-2)
curve(1+x^2/2,add=TRUE,col="grey")

## ----set-up-running-example, include=FALSE-------------------------------
# Generate data for the running example
# X: Gaussian but with more-than-standard variance
x = rnorm(150,0,3)
# Y: Linearly dependent on X but with variance that grows with X
# (heteroskedastic)
y = 3-2*x + rnorm(length(x),0,sapply(x,function(x){1+0.5*x^2}))

## ----x-y-scatterplot-with-ols-line, echo=FALSE---------------------------
# Plot the data
plot(x,y)
# Plot the true regression line
abline(a=3,b=-2,col="grey")
# Fit by ordinary least squares
fit.ols = lm(y~x)
# Plot that line
abline(fit.ols,lty="dashed")

## ----ols-residuals-vs-x, echo=FALSE--------------------------------------
par(mfrow=c(1,2))
plot(x,residuals(fit.ols))
plot(x,(residuals(fit.ols))^2)
par(mfrow=c(1,1))

## ----code:ols-heterosked-------------------------------------------------
# Generate more random samples from the same model and the same x values,
# but different y values
# Inputs: number of samples to generate
# Presumes: x exists and is defined outside this function
# Outputs: errors in linear regression estimates
ols.heterosked.example = function(n) {
  y = 3-2*x + rnorm(n,0,sapply(x,function(x){1+0.5*x^2}))
  fit.ols = lm(y~x)
  # Return the errors
  return(fit.ols$coefficients - c(3,-2))
}

# Calculate average-case errors in linear regression estimates (SD of
# slope and intercept)
# Inputs: number of samples per replication, number of replications (defaults
#         to 10,000)
# Calls: ols.heterosked.example
# Outputs: standard deviation of intercept and slope
ols.heterosked.error.stats = function(n,m=10000) {
  ols.errors.raw = t(replicate(m,ols.heterosked.example(n)))
  # transpose gives us a matrix with named columns
  intercept.sd = sd(ols.errors.raw[,"(Intercept)"])
  slope.sd = sd(ols.errors.raw[,"x"])
  return(list(intercept.sd=intercept.sd,slope.sd=slope.sd))
}

## ----include=FALSE-------------------------------------------------------
some.sims <- ols.heterosked.error.stats(length(x))

## ----x-y-scatterplot-with-wls-line, echo=FALSE, out.width="0.5\\textwidth"----
# Plot the data
plot(x,y)
# Plot the true regression line
abline(a=3,b=-2,col="grey")
# Fit by ordinary least squares
fit.ols = lm(y~x)
# Plot that line
abline(fit.ols,lty="dashed")
fit.wls = lm(y~x, weights=1/(1+0.5*x^2))
abline(fit.wls,lty="dotted")

## ----include=FALSE-------------------------------------------------------
wls.coefs <- coefficients(summary(fit.wls))

## ------------------------------------------------------------------------
### As previous two functions, but with weighted regression

# Generate random sample from model (with fixed x), fit by weighted least
# squares
# Inputs: number of samples
# Presumes: x fixed outside function
# Outputs: errors in parameter estimates
wls.heterosked.example = function(n) {
  y = 3-2*x + rnorm(n,0,sapply(x,function(x){1+0.5*x^2}))
  fit.wls = lm(y~x,weights=1/(1+0.5*x^2))
  # Return the errors
  return(fit.wls$coefficients - c(3,-2))
}

# Calculate standard errors in parameter estiamtes over many replications
# Inputs: number of samples per replication, number of replications (defaults
#         to 10,000)
# Calls: wls.heterosked.example
# Outputs: standard deviation of estimated intercept and slope
wls.heterosked.error.stats = function(n,m=10000) {
  wls.errors.raw = t(replicate(m,wls.heterosked.example(n)))
  # transpose gives us a matrix with named columns
  intercept.sd = sd(wls.errors.raw[,"(Intercept)"])
  slope.sd = sd(wls.errors.raw[,"x"])
  return(list(intercept.sd=intercept.sd,slope.sd=slope.sd))
}

## ----funnel-plot, echo=FALSE---------------------------------------------
mobility <- read.csv("http://www.stat.cmu.edu/~cshalizi/mreg/15/lectures/24--25/mobility2.csv")
plot(Mobility ~ Population, data=mobility, log="x", ylim=c(0,0.5))

## ----eval=FALSE----------------------------------------------------------
## mobility <- read.csv("http://www.stat.cmu.edu/~cshalizi/mreg/15/lectures/24--25/mobility2.csv")
## plot(Mobility ~ Population, data=mobility, log="x", ylim=c(0,0.5))

## ------------------------------------------------------------------------
mobility$MobSE <- with(mobility, sqrt(Mobility*(1-Mobility)/Population))

## ----mobility-scatterplot, echo=FALSE------------------------------------
plot(Mobility ~ Commute, data=mobility,
     xlab="Fraction of workers with short commutes",
     ylab="Rate of economic mobility", pch=19, cex=0.2)
with(mobility, segments(x0=Commute, y0=Mobility+2*MobSE,
                        x1=Commute, y1=Mobility-2*MobSE, col="blue"))
mob.lm <- lm(Mobility ~ Commute, data=mobility)
mob.wlm <- lm(Mobility ~ Commute, data=mobility, weight=1/MobSE^2)
abline(mob.lm)
abline(mob.wlm, col="blue")

## ----eval=FALSE----------------------------------------------------------
## plot(Mobility ~ Commute, data=mobility,
##      xlab="Fraction of workers with short commutes",
##      ylab="Rate of economic mobility", pch=19, cex=0.2)
## with(mobility, segments(x0=Commute, y0=Mobility+2*MobSE,
##                         x1=Commute, y1=Mobility-2*MobSE, col="blue"))
## mob.lm <- lm(Mobility ~ Commute, data=mobility)
## mob.wlm <- lm(Mobility ~ Commute, data=mobility, weight=1/MobSE^2)
## abline(mob.lm)
## abline(mob.wlm, col="blue")

## ----ols-residuals-with-smooth, echo=FALSE-------------------------------
plot(x,residuals(fit.ols)^2,ylab="squared residuals")
curve((1+x^2/2)^2,col="grey",add=TRUE)
var1 <- smooth.spline(x=x, y=log(residuals(fit.ols)^2), cv=TRUE)
grid.x <- seq(from=min(x),to=max(x),length.out=300)
lines(grid.x, exp(predict(var1,x=grid.x)$y))

## ------------------------------------------------------------------------
fit.wls1 <- lm(y~x,weights=1/exp(var1$y))
coefficients(fit.wls1)
var2 <- smooth.spline(x=x, y=log(residuals(fit.wls1)^2), cv=TRUE)

## ----fit-wls1,echo=FALSE-------------------------------------------------
fit.wls1 <- lm(y~x,weights=1/exp(var1$y))
par(mfrow=c(1,2))
plot(x,y)
abline(a=3,b=-2,col="grey")
abline(fit.ols,lty="dashed")
abline(fit.wls1,lty="dotted")
plot(x,(residuals(fit.ols))^2,ylab="squared residuals")
points(x,residuals(fit.wls1)^2,pch=15)
lines(grid.x, exp(predict(var1,x=grid.x)$y))
var2 <- smooth.spline(x=x, y=log(residuals(fit.wls1)^2), cv=TRUE)
curve((1+x^2/2)^2,col="grey",add=TRUE)
lines(grid.x, exp(predict(var2,x=grid.x)$y),lty="dotted")
par(mfrow=c(1,1))

## ----fit-wls2------------------------------------------------------------
fit.wls2 <- lm(y~x,weights=1/exp(var2$y))
coefficients(fit.wls2)
var3 <- smooth.spline(x=x, y=log(residuals(fit.wls2)^2), cv=TRUE)

## ----fit-wls3_and_4------------------------------------------------------
fit.wls3 <- lm(y~x,weights=1/exp(var3$y))
coefficients(fit.wls3)
var4 <- smooth.spline(x=x, y=log(residuals(fit.wls3)^2), cv=TRUE)
fit.wls4 <- lm(y~x,weights=1/exp(var4$y))
coefficients(fit.wls4)

## ------------------------------------------------------------------------
iterative.wls <- function(x,y,tol=0.01,max.iter=100) {
  iteration <- 1
  old.coefs <- NA
  regression <- lm(y~x)
  coefs <- coefficients(regression)
  while (is.na(old.coefs) ||
         ((max(abs(coefs - old.coefs)) > tol) && (iteration < max.iter))) {
    variance <- smooth.spline(x=x, y=log(residuals(regression)^2), cv=TRUE)
    old.coefs <- coefs
    iteration <- iteration+1
    regression <- lm(y~x,weights=1/exp(variance$y))
    coefs <- coefficients(regression)
  }
  return(list(regression=regression,variance=variance,iterations=iteration))
}