# looking briefly at the mosteller-tukey data...
# Read the whitespace-delimited school data set (20 schools; columns
# described below). NOTE(review): no header/colnames arguments are given,
# so the file presumably carries its own header row with names
# x1..x5, z, y -- TODO confirm against the data file.
schools <- read.table("mosteller-tukey.txt")
# The data set collects average test scores and other background
# variables for 20 schools:
#
# x1 -- teacher salary per pupil
# x2 -- % of fathers white-collar
# x3 -- SES measure
# x4 -- teachers' verbal scores
# x5 -- mothers' schooling
# z -- average 6th grade students' verbal scores in each school
# y -- y=1 if z>37; y=0 if z<37
# We start with a simple "main effects" logistic regression model...
# Fit the "main effects" logistic regression of y on all five background
# variables, then print the usual coefficient/deviance summary.
fit0 <- glm(y ~ x1 + x2 + x3 + x4 + x5, data = schools, family = binomial)
summary(fit0)
# Deviance Residuals:
# Min 1Q Median 3Q Max
# -2.234e+00 -8.112e-02 -8.213e-05 3.263e-01 8.441e-01
#
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -4.5635 33.1771 -0.138 0.891
# x1 2.1346 3.3235 0.642 0.521
# x2 0.1135 0.1592 0.713 0.476
# x3 0.9789 0.8487 1.153 0.249
# x4 2.0242 1.3251 1.528 0.127
# x5 -10.0928 9.7992 -1.030 0.303
#
# (Dispersion parameter for binomial family taken to be 1)
#
# Null deviance: 27.526 on 19 degrees of freedom
# Residual deviance: 8.343 on 14 degrees of freedom
# AIC: 20.343
# the residuals (below) look "varietal" for logistic regression!
# (i.e. strongly patterned and difficult to visually interpret...),
# even though the model fit is apparently pretty good (deviance=8.3 on
# 14 df)
# Draw the first four glm diagnostic plots for fit0 in a 2 x 2 grid
# (residuals vs fitted, Q-Q, scale-location, Cook's distance).
# NOTE(review): par() is modified globally here and never restored, so
# later plots in the session inherit the 2 x 2 layout.
par(mfrow=c(2,2))
plot(fit0,1:4)
#########################################################
#
# Although the fit is good, none of the coefficients are
# significant... The reasons are the usual suspects in
# regression analysis:
# * sample size is small: n=20
# * collinearity in the predictors, as shown next:
# Predictor matrix (design matrix of fit0 with the intercept column
# dropped) and its pairwise correlations, rounded for readability.
X <- model.matrix(fit0)[, -1]
cor_X <- cor(X)
round(cor_X, 2)
# x1 x2 x3 x4 x5
# x1 1.00 0.18 0.23 0.50 0.20 -- teacher salary per pupil
# x2 0.18 1.00 0.83 0.05 0.93 -- % of fathers white-collar
# x3 0.23 0.83 1.00 0.18 0.82 -- SES measure
# x4 0.50 0.05 0.18 1.00 0.12 -- teachers' verbal scores
# x5 0.20 0.93 0.82 0.12 1.00 -- mothers' schooling
# Eigenvalues of X'X: the enormous spread (~57545 down to ~1.8) confirms
# near-collinearity among the predictors. crossprod(X) is the idiomatic,
# more efficient way to form t(X) %*% X, and symmetric = TRUE lets eigen()
# use the symmetric solver.
round(eigen(crossprod(X), symmetric = TRUE)$values, 2)
# [1] 57545.10 3697.31 465.89 3.11 1.80
# Principal components of the uncentered predictor matrix. Use FALSE, not
# the abbreviation F: T and F are ordinary variables that can be
# reassigned, so spelling out TRUE/FALSE is the safe idiom.
summary(prcomp(X, center = FALSE))
# Importance of components:
# PC1 PC2 PC3 PC4 PC5
# Standard deviation 55.034 13.9497 4.95184 0.40442 0.30806
# Proportion of Variance 0.932 0.0599 0.00755 0.00005 0.00003
# Cumulative Proportion 0.932 0.9924 0.99992 0.99997 1.00000
# So we expect to see only two or three non-collinear predictors in X.
#
# We can try stepwise variable selection to see which x's seem to
# provide a good model:
library(MASS)
# Refit the main-effects model, then run bidirectional stepwise selection.
# BUG FIX: in the original call, k = 2 was written *inside* the scope list,
# where stepAIC silently ignores it. k is an argument of stepAIC itself
# (the penalty per parameter; k = 2 gives AIC and is also the default, so
# the selected model is unchanged, but the intent is now expressed
# correctly).
basemodel <- glm(y ~ x1 + x2 + x3 + x4 + x5, data = schools, family = binomial)
fit1 <- eval(stepAIC(basemodel,
                     scope = list(lower = . ~ 1,
                                  upper = . ~ x1 + x2 + x3 + x4 + x5),
                     k = 2)$call)
# Likelihood-ratio comparison of the selected model against the full fit.
anova(fit1, fit0, test = "Chisq")
# Analysis of Deviance Table
#
# Model 1: y ~ x3 + x4
# Model 2: y ~ x1 + x2 + x3 + x4 + x5
# Resid. Df Resid. Dev Df Deviance P(>|Chi|)
# 1 17 10.1414
# 2 14 8.3429 3 1.7984 0.6153
# So it looks like a model that only involves SES and teachers' verbal
# achievement scores works about as well as the model with all
# variables.
# We might also try to add in interactions, but then something
# interesting happens...
# Stepwise AIC search allowing interactions up to fifth order.
# BUG FIX: as with fit1, the original placed k = 2 inside the scope list,
# where stepAIC ignores it; k belongs to stepAIC itself (k = 2 is the AIC
# penalty and the default, so selection is unaffected).
fit2 <- eval(stepAIC(basemodel,
                     scope = list(lower = . ~ 1,
                                  upper = . ~ (x1 + x2 + x3 + x4 + x5)^5),
                     k = 2)$call)
# After much output we find the model y ~ x3 + x4*x5:
#
# y ~ x3 + x4 + x5 + x4:x5
# [...]
# There were 50 or more warnings (use warnings() to see the first 50)
#
# Consulting the warnings we find:
# Print the warnings accumulated during the stepwise glm fits (separation
# warnings: "fitted probabilities numerically 0 or 1 occurred", etc.).
warnings()
# Warning messages:
# 1: algorithm did not converge in:
# glm.fit(X, y, wt, offset = object$offset, family = object$family, ...
# 2: fitted probabilities numerically 0 or 1 occurred in:
# glm.fit(X, y, wt, offset = object$offset, family = object$family, ...
# 3: algorithm did not converge in:
# glm.fit(X, y, wt, offset = object$offset, family = object$family, ...
# 4: fitted probabilities numerically 0 or 1 occurred in:
# glm.fit(X, y, wt, offset = object$offset, family = object$family, ...
# etc.
#
# If we compare fitted to observed values of y, we see the problem:
# Side-by-side table of the predictors, the observed y, and the fitted
# probabilities from fit2, rounded to 8 decimal places.
fitted_probs <- fitted(fit2)
round(cbind(schools[, 1:6], fits = fitted_probs), 8)
# x1 x2 x3 x4 x5 y fits
# 1 3.83 28.87 7.20 26.60 6.19 1 1
# 2 2.89 20.10 -11.71 24.40 5.17 0 0
# 3 2.86 69.05 12.32 25.70 7.04 0 0
# 4 2.92 65.40 14.28 25.70 7.10 1 1
# 5 3.06 29.59 6.31 25.40 6.15 1 1
# 6 2.07 44.82 6.16 21.60 6.41 0 0
# 7 2.52 77.37 12.70 24.90 6.86 1 1
# 8 2.45 24.67 -0.17 25.01 5.78 0 0
# 9 3.13 65.01 9.85 26.60 6.51 1 1
# 10 2.44 9.99 -0.05 28.01 5.57 1 1
# 11 2.09 12.20 -12.86 23.51 5.62 0 0
# 12 2.52 22.55 0.92 23.60 5.34 0 0
# 13 2.22 14.30 4.77 24.51 5.80 0 0
# 14 2.67 31.79 -0.96 25.80 6.19 0 0
# 15 2.71 11.60 -16.04 25.20 5.62 0 0
# 16 3.14 68.47 10.62 25.01 6.94 1 1
# 17 3.54 42.64 2.66 25.01 6.33 0 0
# 18 2.52 16.70 -10.99 24.80 6.01 0 0
# 19 2.68 86.27 15.03 25.51 7.51 1 1
# 20 2.37 76.73 12.77 24.51 6.96 1 1
#
# so the fitted values (p-hat's) are all 0 or 1, to eight or more
# decimal places (and have completely nailed the observed y's). In
# other words, y ~ x3 + x4*x5 is essentially already the saturated
# model for this data.
#
# This is a standard problem with logistic regression: the saturated
# model really can't be fitted ("algorithm did not converge" above)
# because in the expression
#
# log p/(1-p) = b0 + b1*x1 + b2*x2 + ...
#
# the expression on the left becomes infinite as p approaches 0 or 1.
#######################################################################
# For comparison we also try model selection using the z scores, and
# standard normal-errors linear regression:
# Repeat model selection on the *continuous* score z with ordinary linear
# regression, again allowing interactions of all orders.
basemodel <- lm(z ~ x1 + x2 + x3 + x4 + x5, data = schools)
norm1 <- eval(stepAIC(basemodel,
                      scope = list(lower = . ~ 1,
                                   upper = . ~ (x1 + x2 + x3 + x4 + x5)^5),
                      k = 2)$call)  # k = 2 for AIC
norm1$call
# lm(formula = z ~ x1 + x3 + x4, data = schools)
norm2 <- eval(stepAIC(basemodel,
                      scope = list(lower = . ~ 1,
                                   upper = . ~ (x1 + x2 + x3 + x4 + x5)^5),
                      # k = log(n) for BIC; generalized from the
                      # hard-coded log(20) so this stays correct if the
                      # data set changes size.
                      k = log(nrow(schools)))$call)
norm2$call
# lm(formula = z ~ x3 + x4, data = schools)
# It is interesting to note that AIC and BIC stepwise procedures chose
# models very close to the model y ~ x3 + x4 for logistic regression,
# even though we gave access to interactions of all orders. This
# suggests that the x4*x5 interaction (which was selected by stepAIC
# from logistic regression models of all orders) was more useful for
# predicting the simpler response surface of y (which is a dichotomized
# version of z) than for predicting the more complex response surface of
# z itself.
#
# Dichotomization always changes the information in the data. Sometimes
# dichotomization is necessary for the question being asked (is this
# school performing "well enough"?) but even then the place we
# dichotomize (z=37 was the cutoff between y=0 and y=1 here) matters. I
# know of no good "general rules" for dichotomization; and I'd suggest a
# sensitivity analysis (try different cutoffs and see how that affects
# the results) if you do need to dichotomize for some purpose.
##################################################################