## ----echo=FALSE---------------------------------------------------------- library(knitr) opts_chunk$set(size="small",background="white", highlight=FALSE, cache=TRUE, autdep=TRUE, tidy=TRUE, options(show.signif.stars=FALSE)) ## ----linear-response-and-quadratic-heteroskedasticity, echo=FALSE, out.height="0.25\\textheight"---- plot(0,type="n",xlim=c(-5,5),ylim=c(-15,5), xlab="x", ylab="y") abline(a=3,b=-2) curve(1+x^2/2,add=TRUE,col="grey") ## ----set-up-running-example, include=FALSE------------------------------- # Generate data for the running example # X: Gaussian but with more-than-standard variance x = rnorm(150,0,3) # Y: Linearly dependent on X but with variance that grows with X # (heteroskedastic) y = 3-2*x + rnorm(length(x),0,sapply(x,function(x){1+0.5*x^2})) ## ----x-y-scatterplot-with-ols-line, echo=FALSE--------------------------- # Plot the data plot(x,y) # Plot the true regression line abline(a=3,b=-2,col="grey") # Fit by ordinary least squares fit.ols = lm(y~x) # Plot that line abline(fit.ols,lty="dashed") ## ----ols-residuals-vs-x, echo=FALSE-------------------------------------- par(mfrow=c(1,2)) plot(x,residuals(fit.ols)) plot(x,(residuals(fit.ols))^2) par(mfrow=c(1,1)) ## ----code:ols-heterosked------------------------------------------------- # Generate more random samples from the same model and the same x values, # but different y values # Inputs: number of samples to generate # Presumes: x exists and is defined outside this function # Outputs: errors in linear regression estimates ols.heterosked.example = function(n) { y = 3-2*x + rnorm(n,0,sapply(x,function(x){1+0.5*x^2})) fit.ols = lm(y~x) # Return the errors return(fit.ols$coefficients - c(3,-2)) } # Calculate average-case errors in linear regression estimates (SD of # slope and intercept) # Inputs: number of samples per replication, number of replications (defaults # to 10,000) # Calls: ols.heterosked.example # Outputs: standard deviation of intercept and slope ols.heterosked.error.stats = function(n,m=10000) { ols.errors.raw = t(replicate(m,ols.heterosked.example(n))) # transpose gives us a matrix with named columns intercept.sd = sd(ols.errors.raw[,"(Intercept)"]) slope.sd = sd(ols.errors.raw[,"x"]) return(list(intercept.sd=intercept.sd,slope.sd=slope.sd)) } ## ----include=FALSE------------------------------------------------------- some.sims <- ols.heterosked.error.stats(length(x)) ## ----x-y-scatterplot-with-wls-line, echo=FALSE, out.width="0.5\\textwidth"---- # Plot the data plot(x,y) # Plot the true regression line abline(a=3,b=-2,col="grey") # Fit by ordinary least squares fit.ols = lm(y~x) # Plot that line abline(fit.ols,lty="dashed") fit.wls = lm(y~x, weights=1/(1+0.5*x^2)) abline(fit.wls,lty="dotted") ## ----include=FALSE------------------------------------------------------- wls.coefs <- coefficients(summary(fit.wls)) ## ------------------------------------------------------------------------ ### As previous two functions, but with weighted regression # Generate random sample from model (with fixed x), fit by weighted least # squares # Inputs: number of samples # Presumes: x fixed outside function # Outputs: errors in parameter estimates wls.heterosked.example = function(n) { y = 3-2*x + rnorm(n,0,sapply(x,function(x){1+0.5*x^2})) fit.wls = lm(y~x,weights=1/(1+0.5*x^2)) # Return the errors return(fit.wls$coefficients - c(3,-2)) } # Calculate standard errors in parameter estiamtes over many replications # Inputs: number of samples per replication, number of replications (defaults # to 10,000) # Calls: wls.heterosked.example # Outputs: standard deviation of estimated intercept and slope wls.heterosked.error.stats = function(n,m=10000) { wls.errors.raw = t(replicate(m,wls.heterosked.example(n))) # transpose gives us a matrix with named columns intercept.sd = sd(wls.errors.raw[,"(Intercept)"]) slope.sd = sd(wls.errors.raw[,"x"]) return(list(intercept.sd=intercept.sd,slope.sd=slope.sd)) } ## ----funnel-plot, echo=FALSE--------------------------------------------- mobility <- read.csv("http://www.stat.cmu.edu/~cshalizi/mreg/15/lectures/24--25/mobility2.csv") plot(Mobility ~ Population, data=mobility, log="x", ylim=c(0,0.5)) ## ----eval=FALSE---------------------------------------------------------- ## mobility <- read.csv("http://www.stat.cmu.edu/~cshalizi/mreg/15/lectures/24--25/mobility2.csv") ## plot(Mobility ~ Population, data=mobility, log="x", ylim=c(0,0.5)) ## ------------------------------------------------------------------------ mobility$MobSE <- with(mobility, sqrt(Mobility*(1-Mobility)/Population)) ## ----mobility-scatterplot, echo=FALSE------------------------------------ plot(Mobility ~ Commute, data=mobility, xlab="Fraction of workers with short commutes", ylab="Rate of economic mobility", pch=19, cex=0.2) with(mobility, segments(x0=Commute, y0=Mobility+2*MobSE, x1=Commute, y1=Mobility-2*MobSE, col="blue")) mob.lm <- lm(Mobility ~ Commute, data=mobility) mob.wlm <- lm(Mobility ~ Commute, data=mobility, weight=1/MobSE^2) abline(mob.lm) abline(mob.wlm, col="blue") ## ----eval=FALSE---------------------------------------------------------- ## plot(Mobility ~ Commute, data=mobility, ## xlab="Fraction of workers with short commutes", ## ylab="Rate of economic mobility", pch=19, cex=0.2) ## with(mobility, segments(x0=Commute, y0=Mobility+2*MobSE, ## x1=Commute, y1=Mobility-2*MobSE, col="blue")) ## mob.lm <- lm(Mobility ~ Commute, data=mobility) ## mob.wlm <- lm(Mobility ~ Commute, data=mobility, weight=1/MobSE^2) ## abline(mob.lm) ## abline(mob.wlm, col="blue") ## ----ols-residuals-with-smooth, echo=FALSE------------------------------- plot(x,residuals(fit.ols)^2,ylab="squared residuals") curve((1+x^2/2)^2,col="grey",add=TRUE) var1 <- smooth.spline(x=x, y=log(residuals(fit.ols)^2), cv=TRUE) grid.x <- seq(from=min(x),to=max(x),length.out=300) lines(grid.x, exp(predict(var1,x=grid.x)$y)) ## ------------------------------------------------------------------------ fit.wls1 <- lm(y~x,weights=1/exp(var1$y)) coefficients(fit.wls1) var2 <- smooth.spline(x=x, y=log(residuals(fit.wls1)^2), cv=TRUE) ## ----fit-wls1,echo=FALSE------------------------------------------------- fit.wls1 <- lm(y~x,weights=1/exp(var1$y)) par(mfrow=c(1,2)) plot(x,y) abline(a=3,b=-2,col="grey") abline(fit.ols,lty="dashed") abline(fit.wls1,lty="dotted") plot(x,(residuals(fit.ols))^2,ylab="squared residuals") points(x,residuals(fit.wls1)^2,pch=15) lines(grid.x, exp(predict(var1,x=grid.x)$y)) var2 <- smooth.spline(x=x, y=log(residuals(fit.wls1)^2), cv=TRUE) curve((1+x^2/2)^2,col="grey",add=TRUE) lines(grid.x, exp(predict(var2,x=grid.x)$y),lty="dotted") par(mfrow=c(1,1)) ## ----fit-wls2------------------------------------------------------------ fit.wls2 <- lm(y~x,weights=1/exp(var2$y)) coefficients(fit.wls2) var3 <- smooth.spline(x=x, y=log(residuals(fit.wls2)^2), cv=TRUE) ## ----fit-wls3_and_4------------------------------------------------------ fit.wls3 <- lm(y~x,weights=1/exp(var3$y)) coefficients(fit.wls3) var4 <- smooth.spline(x=x, y=log(residuals(fit.wls3)^2), cv=TRUE) fit.wls4 <- lm(y~x,weights=1/exp(var4$y)) coefficients(fit.wls4) ## ------------------------------------------------------------------------ iterative.wls <- function(x,y,tol=0.01,max.iter=100) { iteration <- 1 old.coefs <- NA regression <- lm(y~x) coefs <- coefficients(regression) while (is.na(old.coefs) || ((max(abs(coefs - old.coefs)) > tol) && (iteration < max.iter))) { variance <- smooth.spline(x=x, y=log(residuals(regression)^2), cv=TRUE) old.coefs <- coefs iteration <- iteration+1 regression <- lm(y~x,weights=1/exp(variance$y)) coefs <- coefficients(regression) } return(list(regression=regression,variance=variance,iterations=iteration)) }