# Load and inspect the kidiq data set (child IQ example from Gelman & Hill).
library("foreign") # for read.dta() -- reads STATA .dta files...
kidiq <- read.dta("child-iq/kidiq.dta",convert.underscore=TRUE)
str(kidiq)    # structure: column names, types, first values
View(kidiq)   # interactive spreadsheet view (RStudio/GUI)
# NOTE(review): attach() is generally discouraged (masking/scoping
# surprises); kept here because the rest of this teaching script refers
# to the columns kid.score, mom.hs, mom.iq by bare name, and detach()
# is called at the bottom of the file.
attach(kidiq)
##############################################################
#
# For a binary (0/1) predictor, the regression coefficient is the
# difference between the two group means
#
# This easy interpretation is a big reason many statisticians like
# to code two-level factors with 0/1 predictors.
# Scatterplot of child score vs. the 0/1 mom.hs indicator; widen the
# x-range so the two point columns sit away from the plot edges.
plot(kid.score ~ mom.hs,xlab="Mom Finished HS? 0 = No, 1 = Yes",
ylab="Child's Test Score",xlim=c(-0.5,1.5))
# Overlay boxplots at x = 0 and x = 1. Two equivalent calls are shown
# on purpose: the formula interface, and the split()-into-a-list form.
boxplot(kid.score ~ mom.hs,at=0:1,add=TRUE,axes=FALSE)
boxplot(split(kid.score,mom.hs),at=0:1,add=TRUE,axes=FALSE)
# Per-group means, drawn as short horizontal blue segments at each group.
means <- sapply(split(kid.score,mom.hs),mean)
lines(c(-0.25,0.25),rep(means[1],2),col="Blue")
lines(c(0.75,1.25),rep(means[2],2),col="Blue")
# Simple regression of child score on the binary predictor mom.hs.
fit.lm.1 <- lm(kid.score ~ mom.hs)
# (1) we are fitting the model
#
# kid.score = (intercept) + (regression coefficient)*mom.hs + (error)
#
# (2) We don't have to specify the intercept, or even the
# equation. We just specify the y variable to the left of
# the ~, and the x variable to the right of the ~.
abline(fit.lm.1) # add the fitted line to the current plot
print(fit.lm.1)   # terse: call + coefficients
summary(fit.lm.1) # full table: SEs, t-values, R^2, ...
library(arm) # this is a library of useful helper functions that
# "comes with" the Gelman & Hill book
display(fit.lm.1) # from library "arm" [special library for G&H]
coef(fit.lm.1) # named vector: (Intercept), mom.hs
###############################################################
#
# Note that the intercept is the mean of the group at mom.hs=0,
# and the sum of the intercept and regression coefficient is the
# mean of the group at mom.hs=1
#
means[1] # group mean for mom.hs == 0 ...
coef(fit.lm.1)[1] # ... equals the fitted intercept
means[2] # group mean for mom.hs == 1 ...
coef(fit.lm.1)[1] + coef(fit.lm.1)[2] # ... equals intercept + slope
# Mark both fitted group means on the plot with large red dots.
points(0,coef(fit.lm.1)[1],pch=19,cex=2,col="Red")
points(1,coef(fit.lm.1)[1]+coef(fit.lm.1)[2],pch=19,cex=2,col="Red")
###############################################################
#
# (aside: jitter can be an effective way of plotting data)
#
# With a 0/1 predictor the points overplot badly; adding a small
# uniform perturbation to x (for display only) spreads them out.
n <- length(mom.hs)
mom.hs.jit <- mom.hs + runif(n,-0.1,0.1)
plot(kid.score ~ mom.hs.jit,xlab="Mom Finished HS? 0 = No, 1 = Yes",
ylab="Child's Test Score",xlim=c(-0.5,1.5))
# or, we can use the built-in "jitter" function:
plot(kid.score ~ jitter(mom.hs),xlab="Mom Finished HS? 0 = No, 1 = Yes",
ylab="Child's Test Score",xlim=c(-0.5,1.5))
help(jitter)
# BUG FIX: 'amount' must be positive. jitter() adds runif(n, -amount,
# amount), so amount = -0.1 asks for runif(n, 0.1, -0.1) -- min > max --
# which yields NaN with warnings. amount = 0.1 reproduces the
# runif(n,-0.1,0.1) "by hand" version above.
mom.hs.jit <- jitter(mom.hs, amount = 0.1) # same as "by hand" above
##############################################################
#
# For a continuous predictor, the regression coefficient is the
# expected difference in y values for a change of 1 unit in x
plot(kid.score ~ mom.iq,xlab="Mom's IQ Score",ylab="Child's Test Score")
fit.lm.2 <- lm(kid.score ~ mom.iq)
display(fit.lm.2) # arm::display -- compact coefficient summary
abline(fit.lm.2) # overlay the fitted regression line
#############################################################
#
# Two interpretations for the fact that the coefficient on mom.iq
# is 0.61:
#
# PREDICTIVE:
#
# When comparing two groups of children whose moms differ in IQ by
# one unit, we expect to see a 0.61 difference in the average
# test scores of the two groups of kids
#
# COUNTERFACTUAL:
#
# If we could increase the IQ of this mother by 1 unit, we would
# see an average increase in the child's test score of 0.61.
#
# Easy to blend these two, but important to keep them distinct.
# It may not be meaningful to think of changing the predictor
# variable!
#############################################################
#
# We can think of the fitted line as a function ...
#
plot(kid.score ~ mom.iq,xlab="Mom's IQ Score",ylab="Child's Test Score")
# Evaluate the fitted line from fit.lm.2 at x: intercept + slope * x.
# Vectorized in x because the underlying arithmetic is.
fitted.line <- function(x) {
coef(fit.lm.2)[1] + coef(fit.lm.2)[2]*x
}
# Evaluate at IQ = 70, 80, ..., 130: element-by-element in a loop ...
for (x in (7:13)*10) {
print(fitted.line(x))
}
# ... or in a single vectorized call:
fitted.line((7:13)*10)
# FIX: use TRUE, not T (T is an ordinary variable and can be reassigned).
curve(fitted.line,add=TRUE) # note this neat use of "curve" with a function!
points((7:13)*10,fitted.line((7:13)*10),pch=19,col="Red")
# What are these points we have plotted?
#
# Two interpretations for fitted.line(70) = 68.50:
#
# ASSOCIATIVE/PREDICTIVE:
#
# 68.50 is this model's estimate of the mean test score of
# all kids whose mothers have an IQ of 70.
#
# FUNCTIONAL:
#
# Give me a mother with an IQ score of 70; my best prediction
# of her child's test score is 68.50.
#
#############################################################
#
# What does fitted.line(0) mean?
#
# fitted.line(0) = 25.80
#
# mean of kids test scores whose moms have IQ 0??
#
# best prediction of test score for a kid whose mom has IQ 0??
#
# neither makes much sense!
#
# Can accept the intercept as a mathematical fact of life, ORRR...
# can rewrite the variables so that the intercept makes sense!
# Center mom.iq at its sample mean so the intercept is interpretable
# as the prediction for an average-IQ mother.
(mean(mom.iq)) # sample mean of mom.iq (printed for reference)
mom.iq.c <- mom.iq - mean(mom.iq)
fit.lm.3 <- lm(kid.score ~ mom.iq.c)
display(fit.lm.3)
# now the intercept tells us the expected value of kids' scores whose moms
# have IQ at the mean: 100.
plot(kid.score ~ mom.iq.c,xlab="Centered Mom's IQ (\"0\" means IQ=100)",
ylab="Kids' Test Scores")
abline(fit.lm.3)
abline(v=0,col="Grey") # vertical guide at centered IQ = 0 (the mean)
abline(h=coef(fit.lm.3)[1],col="Grey",lty=5) # horizontal guide at the intercept
# sometimes it makes sense to divide by the SD as well
mean(mom.iq)
sd(mom.iq)
# Standardize: z-score of mom's IQ (sample mean 0, sample SD 1).
mom.iq.z <- (mom.iq - mean(mom.iq))/sd(mom.iq)
fit.lm.4 <- lm(kid.score ~ mom.iq.z)
display(fit.lm.4)
plot(kid.score ~ mom.iq.z,xlab="Mom's IQ Z-score",
ylab="Kids' Test Scores")
abline(fit.lm.4)
abline(v=0,col="Grey") # z = 0 <=> mom.iq at its sample mean
abline(h=coef(fit.lm.4)[1],col="Grey",lty=5) # intercept: prediction at mean IQ
# the interpretation of the intercept is still the average
# kids' score for moms at the mean IQ (100)
#
# but now the interpretation of the regression coefficient
# is the expected change in kids' score, for a 1 SD change
# in mothers' IQ.
# (aside: in order to make the size of coefficients of continuous
# predictors more comparable to coefficients of binary predictors,
# G&H actually prefer the standardization
# (mom.iq - mean(mom.iq))/(2*sd(mom.iq)).
# However, for many purposes, just the z-score standardization is fine.)
###############################################################
#
# More than one predictor...
#
# Multiple regression: one binary and one continuous predictor.
fit.lm.5 <- lm(kid.score ~ mom.hs + mom.iq)
# (1) now we are fitting the model
#
# kid.score = (intcpt) + (coef1)*mom.hs + (coef2)*mom.iq + (error)
#
# (2) We don't have to specify the intercept, or even the
# equation. We just specify the y variable to the left of
# the ~, and the two x variables to the right of the ~.
display(fit.lm.5)
# The interpretation of the coefficients is that
#
# (intercept): the y-intercept (average kid score) for kids
# whose moms did not attend hs (mom.hs==0)
#
# (intcpt+coef1): the y-intercept (avg kid score) for kids
# whose moms *did* attend hs (mom.hs==1)
#
# (coef2): change in kid score for a 1-unit change in mom's iq,
# "holding all other predictors fixed!"
#
# Given these interpretations, we can make a plot of the data and the
# fitted regression equation
# Color points by HS status: mom.hs+1 indexes into c("Red","Blue"),
# so Red = no HS (mom.hs==0), Blue = HS (mom.hs==1).
plot(kid.score ~ mom.iq,col=c("Red","Blue")[mom.hs+1],
xlab="Mom's IQ Scores",ylab="Kids' Test Scores")
# No-interaction model => two parallel lines: common slope coef[3],
# intercepts differing by coef[2].
abline(coef(fit.lm.5)[1],coef(fit.lm.5)[3],col="Red")
abline(coef(fit.lm.5)[1]+coef(fit.lm.5)[2],coef(fit.lm.5)[3],col="Blue")
legend(120,40,pch=1,col=c("Red","Blue"),legend=c("HS no","HS yes"))
# Same picture again, but one panel per group.
hs.yes <- mom.hs == 1
hs.no <- mom.hs == 0
par(mfrow=c(2,1)) # two stacked panels
plot(kid.score[hs.yes] ~ mom.iq[hs.yes],col="Blue",
xlab="Mom's IQ Scores",ylab="Kids' Test Scores")
abline(coef(fit.lm.5)[1]+coef(fit.lm.5)[2],coef(fit.lm.5)[3],col="Blue")
legend(130,60,pch=1,col="Blue",legend="HS yes",cex=0.75)
plot(kid.score[hs.no] ~ mom.iq[hs.no],col="Red",
xlab="Mom's IQ Scores",ylab="Kids' Test Scores")
abline(coef(fit.lm.5)[1],coef(fit.lm.5)[3],col="Red")
legend(120,60,pch=1,col="Red",legend="HS no",cex=0.75)
# (could've done that with lattice plots...)
# It appears the data might be better fitted if we allow an interaction,
# so...
fit.lm.6 <- lm(kid.score ~ mom.hs + mom.iq + mom.hs:mom.iq)
# same as before but note use of ":" to denote multiplying
# two predictors together. This is a historical accident
# that goes back to the program "glim" in the 1970's...
display(fit.lm.6)
# Now the model is
#
# kid.score = (intcpt) + (coef1)*mom.hs +
# (coef2)*mom.iq + (coef3)*mom.hs*mom.iq + (error)
#
# interpretation:
#
# (intcpt) = intercept in group 0 [Mom HS no]
# (coef2) = slope (change in y per unit change in x) for group 0
# (coef1) = change in intercept from group 0 to group 1
# (coef3) = change in slope from group 0 to group 1
#
# again we can use this interpretation to produce a plot:
par(mfrow=c(1,1)) # back to a single panel
plot(kid.score ~ mom.iq,col=c("Red","Blue")[mom.hs+1],
xlab="Mom's IQ Scores",ylab="Kids' Test Scores")
# Build each group's intercept and slope from the coefficient vector;
# coef order is (Intercept), mom.hs, mom.iq, mom.hs:mom.iq.
intcpt0 <- coef(fit.lm.6)[1]
slope0 <- coef(fit.lm.6)[3]
intcpt1 <- coef(fit.lm.6)[1] + coef(fit.lm.6)[2]
slope1 <- coef(fit.lm.6)[3] + coef(fit.lm.6)[4]
abline(intcpt0,slope0,col="Red")
abline(intcpt1,slope1,col="Blue")
legend(120,40,pch=1,col=c("Red","Blue"),legend=c("HS no","HS yes"))
# (note that the relationship seems stronger in cases where moms did not
# have hs.... does that make sense?)
# as a shorthand (again from glim days!), we can replace a + b + a:b with
# the simpler notation a*b.
fit.lm.7 <- lm(kid.score ~ mom.hs*mom.iq)
# fit.lm.6 and fit.lm.7 specify the same model; compare the displays:
display(fit.lm.6)
display(fit.lm.7)
###############################################################
#
# Some basic assumptions to check when fitting a linear model
#
# (1) validity - does the general setup of the linear model match up
# with the research question you are asking? Do the variables you
# have to work with really bear on the questions you want answers to?
#
# (2) linearity and additive errors
# * does it make sense to add contributions of x, y, x:y, etc., to
# build up a model? (or is the relationship multiplicative or
# nonlinear, instead of linear?)
#
# (3) Errors independent? (no serial correlations)
#
# (4) Equal variance of errors? (they look like they all came from the
# same distribution?)
#
# (5) errors normally distributed? (do they look like they came from a
# symmetric, unimodal distribution with few or no outliers?)
#
# all except #1 (and possibly #2) above can be checked with standard
# plots that R makes available.
# Standard 2x2 layout for lm's four default diagnostic plots.
par(mfrow=c(2,2))
plot(fit.lm.6)
# the residual plot is great for inspecting independence of errors and
# any violations of linearity in the model; sometimes missing
# predictors can also be detected.
#
# the normal qq plot is good for assessing normality assumption 5
#
# the scale-location plot is good for eyeballing the equal-variance
# assumption (and, implicitly, the additive-errors assumption)
#
# the "residual vs leverage" plot indicates points that have undue
# influence on the fit. Points with high residuals *and* high
# leverage (upper and lower right corners of the plot, since leverage
# increases along the x-axis) are ones to
# especially be concerned about
#
# We are always looking for "nothing exciting" in these plots.
# Nothing really "grabs" us, except possibly a couple of the more
# extremely outlying residuals. Since we are generally happy
# with the diagnostics, we can be done.
# Diagnostics for the earlier fits, for comparison:
plot(fit.lm.2)
plot(fit.lm.3)
plot(fit.lm.4)
plot(fit.lm.5)
# Note that the main difference in plots 2, 3, 4 is some nonlinearity
# and/or some nonconstant variance, as well as some indication of
# high-leverage points.
###############################################################
detach() # undo the attach(kidiq) from the top of the script
###############################################################
###############################################################