# Looking briefly at the Mosteller-Tukey data...

schools <- read.table("mosteller-tukey.txt")

# The data set collects average test scores and other background
# variables for 20 schools:
#
#  x1 -- teacher salary per pupil
#  x2 -- % of fathers white-collar
#  x3 -- SES measure
#  x4 -- teachers' verbal scores
#  x5 -- mothers' schooling
#   z -- average 6th grade students' verbal scores in each school
#   y -- y=1 if z>37; y=0 if z<37

# We start with a simple "main effects" logistic regression model...

summary(fit0 <- glm(y ~ x1 + x2 + x3 + x4 + x5,
                    data = schools, family = binomial))

# Deviance Residuals:
#        Min          1Q      Median          3Q         Max
# -2.234e+00  -8.112e-02  -8.213e-05   3.263e-01   8.441e-01
#
# Coefficients:
#             Estimate Std. Error z value Pr(>|z|)
# (Intercept)  -4.5635    33.1771  -0.138    0.891
# x1            2.1346     3.3235   0.642    0.521
# x2            0.1135     0.1592   0.713    0.476
# x3            0.9789     0.8487   1.153    0.249
# x4            2.0242     1.3251   1.528    0.127
# x5          -10.0928     9.7992  -1.030    0.303
#
# (Dispersion parameter for binomial family taken to be 1)
#
#     Null deviance: 27.526  on 19  degrees of freedom
# Residual deviance:  8.343  on 14  degrees of freedom
# AIC: 20.343

# The residuals (below) look "varietal" for logistic regression!
# (i.e. strongly patterned and difficult to visually interpret...),
# even though the model fit is apparently pretty good (deviance=8.3 on
# 14 df).

par(mfrow = c(2, 2))   # 2x2 grid for the four standard glm diagnostic plots
plot(fit0, 1:4)

#########################################################
#
# Although the fit is good, none of the coefficients are
# significant...
# The reasons are the usual suspects in regression analysis:
#  * sample size is small: n=20
#  * collinearity in the predictors, as shown next:

X <- model.matrix(fit0)[, -1]   # drop the intercept column
round(cor(X), 2)
#      x1   x2   x3   x4   x5
# x1 1.00 0.18 0.23 0.50 0.20  -- teacher salary per pupil
# x2 0.18 1.00 0.83 0.05 0.93  -- % of fathers white-collar
# x3 0.23 0.83 1.00 0.18 0.82  -- SES measure
# x4 0.50 0.05 0.18 1.00 0.12  -- teachers' verbal scores
# x5 0.20 0.93 0.82 0.12 1.00  -- mothers' schooling

round(eigen(t(X) %*% X)$values, 2)
# [1] 57545.10  3697.31   465.89     3.11     1.80

summary(prcomp(X, center = FALSE))
# Importance of components:
#                           PC1     PC2     PC3     PC4     PC5
# Standard deviation     55.034 13.9497 4.95184 0.40442 0.30806
# Proportion of Variance  0.932  0.0599 0.00755 0.00005 0.00003
# Cumulative Proportion   0.932  0.9924 0.99992 0.99997 1.00000

# So we expect to see only two or three non-collinear predictors in X.
#
# We can try stepwise variable selection to see which x's seem to
# provide a good model:

library(MASS)
basemodel <- glm(y ~ x1 + x2 + x3 + x4 + x5,
                 data = schools, family = binomial)

# NOTE: k = 2 (the AIC penalty) is an argument to stepAIC(), not part
# of the scope list.
fit1 <- eval(stepAIC(basemodel,
                     scope = list(lower = . ~ 1,
                                  upper = . ~ x1 + x2 + x3 + x4 + x5),
                     k = 2)$call)

anova(fit1, fit0, test = "Chisq")
# Analysis of Deviance Table
#
# Model 1: y ~ x3 + x4
# Model 2: y ~ x1 + x2 + x3 + x4 + x5
#   Resid. Df Resid. Dev Df Deviance P(>|Chi|)
# 1        17    10.1414
# 2        14     8.3429  3   1.7984    0.6153

# So it looks like a model that only involves SES and teachers' verbal
# achievement scores works about as well as the model with all
# variables.

# We might also try to add in interactions, but then something
# interesting happens...

fit2 <- eval(stepAIC(basemodel,
                     scope = list(lower = . ~ 1,
                                  upper = . ~ (x1 + x2 + x3 + x4 + x5)^5),
                     k = 2)$call)

# After much output we find the model y ~ x3 + x4*x5:
#
#   y ~ x3 + x4 + x5 + x4:x5
#   [...]
# There were 50 or more warnings (use warnings() to see the first 50)
#
# Consulting the warnings we find:

warnings()
# Warning messages:
# 1: algorithm did not converge in:
#    glm.fit(X, y, wt, offset = object$offset, family = object$family, ...
# 2: fitted probabilities numerically 0 or 1 occurred in:
#    glm.fit(X, y, wt, offset = object$offset, family = object$family, ...
# 3: algorithm did not converge in:
#    glm.fit(X, y, wt, offset = object$offset, family = object$family, ...
# 4: fitted probabilities numerically 0 or 1 occurred in:
#    glm.fit(X, y, wt, offset = object$offset, family = object$family, ...
# etc.
#
# If we compare fitted to observed values of y, we see the problem:

round(cbind(schools[, 1:6], fits = fitted(fit2)), 8)
#      x1    x2     x3    x4   x5 y fits
# 1  3.83 28.87   7.20 26.60 6.19 1    1
# 2  2.89 20.10 -11.71 24.40 5.17 0    0
# 3  2.86 69.05  12.32 25.70 7.04 0    0
# 4  2.92 65.40  14.28 25.70 7.10 1    1
# 5  3.06 29.59   6.31 25.40 6.15 1    1
# 6  2.07 44.82   6.16 21.60 6.41 0    0
# 7  2.52 77.37  12.70 24.90 6.86 1    1
# 8  2.45 24.67  -0.17 25.01 5.78 0    0
# 9  3.13 65.01   9.85 26.60 6.51 1    1
# 10 2.44  9.99  -0.05 28.01 5.57 1    1
# 11 2.09 12.20 -12.86 23.51 5.62 0    0
# 12 2.52 22.55   0.92 23.60 5.34 0    0
# 13 2.22 14.30   4.77 24.51 5.80 0    0
# 14 2.67 31.79  -0.96 25.80 6.19 0    0
# 15 2.71 11.60 -16.04 25.20 5.62 0    0
# 16 3.14 68.47  10.62 25.01 6.94 1    1
# 17 3.54 42.64   2.66 25.01 6.33 0    0
# 18 2.52 16.70 -10.99 24.80 6.01 0    0
# 19 2.68 86.27  15.03 25.51 7.51 1    1
# 20 2.37 76.73  12.77 24.51 6.96 1    1
#
# So the fitted values (p-hat's) are all 0 or 1, to eight or more
# decimal places (and have completely nailed the observed y's). In
# other words, y ~ x3 + x4*x5 is essentially already the saturated
# model for this data.
#
# This is a standard problem with logistic regression: the saturated
# model really can't be fitted ("algorithm did not converge" above)
# because in the expression
#
#   log p/(1-p) = b0 + b1*x1 + b2*x2 + ...
#
# the expression on the left becomes infinite as p approaches 0 or 1.

#######################################################################

# For comparison we also try model selection using the z scores, and
# standard normal-errors linear regression:

basemodel <- lm(z ~ x1 + x2 + x3 + x4 + x5, data = schools)

norm1 <- eval(stepAIC(basemodel,
                      scope = list(lower = . ~ 1,
                                   upper = . ~ (x1 + x2 + x3 + x4 + x5)^5),
                      k = 2)$call)           # k=2 for AIC
norm1$call
# lm(formula = z ~ x1 + x3 + x4, data = schools)

norm2 <- eval(stepAIC(basemodel,
                      scope = list(lower = . ~ 1,
                                   upper = . ~ (x1 + x2 + x3 + x4 + x5)^5),
                      k = log(20))$call)     # k=log(sample size) for BIC
norm2$call
# lm(formula = z ~ x3 + x4, data = schools)

# It is interesting to note that AIC and BIC stepwise procedures chose
# models very close to the model y ~ x3 + x4 for logistic regression,
# even though we gave access to interactions of all orders. This
# suggests that the x4*x5 interaction (which was selected by stepAIC
# from logistic regression models of all orders) was more useful for
# predicting the simpler response surface of y (which is a dichotomized
# version of z) than for predicting the more complex response surface of
# z itself.
#
# Dichotomization always changes the information in the data. Sometimes
# dichotomization is necessary for the question being asked (is this
# school performing "well enough"?) but even then the place we
# dichotomize (z=37 was the cutoff between y=0 and y=1 here) matters. I
# know of no good "general rules" for dichotomization; and I'd suggest a
# sensitivity analysis (try different cutoffs and see how that affects
# the results) if you do need to dichotomize for some purpose.

##################################################################