############ Chap. 1 - Agresti ####################

##### Discrete probability distributions ####

rmultinomial<-function (n, p, rows = max(c(length(n), nrow(p))))
{
    # one multinomial draw of size n[i] for each row i of the probability matrix p
    n <- rep(n, length = rows)
    p <- p[rep(1:nrow(p), length = rows), , drop = FALSE]
    sapply(1:rows, function(i, n, p) {
        k <- length(p[i,])
        tabulate(sample(k, n[i], replace = TRUE, prob = p[i,]), nbins = k)
    }, n = n, p = p)
}

n <- c(100,20,10)
p <- matrix(c(.3,.1,.5,.1,.1,.2,.6,.8,.3),3)
rmultinomial(n,p)

##### Vegetarian Example ######

## Wald interval
phat<-0; n<-25
phat + c(-1,1)*qnorm(p=.975)*sqrt(phat*(1-phat)/n)

library(Hmisc)
binconf(x=0, n=25, method="asymptotic")

## Score interval
pi.hat<-0
n<-25
alpha<-.05
anti.logit<-function(x) exp(x)/(1+exp(x))

# score statistic plus/minus the critical value; its two roots are the interval endpoints
Score.func <- function(logit.pi.0) {
    pi.0<-anti.logit(logit.pi.0)
    (pi.hat - pi.0) / sqrt(pi.0*(1-pi.0)/n) + c(1,-1)*qnorm(1-alpha/2)
}

# nlmin can be used to solve a system of nonlinear equations:
solveNonlinear <- function(f, y0, x, ...){
    # solve f(x) = y0
    # x is vector of initial guesses, same length as y0
    # ... are additional arguments to nlmin (not to f)
    g <- function(x, y0, f) sum((f(x) - y0)^2)
    g$y0 <- y0      # set g's default value for y0
    g$f <- f        # set g's default value for f
    nlmin(g, x, ...)
}

temp<-solveNonlinear(Score.func, y0=c(0,0), x=c(-10,-10), print.level=1)
temp
anti.logit(temp$x)
logit<-function(p) log(p/(1-p))

# binconf
binconf(0, 25, method="wilson")
 PointEst         Lower     Upper
        0 1.202937e-017 0.1331923

# prop.test
res<-prop.test(x=0, n=25, correct=F)
res$conf.int
[1] 0.0000000 0.1331923
attr(, "conf.level"):
[1] 0.95

## LR confidence interval

The LR statistic is simple enough here that the upper confidence bound can be
computed analytically. However, we could also have obtained it using uniroot:

LR <- function(pi.0, y, n, alpha) {
    -2*(y*log(pi.0) + (n-y)*log(1-pi.0)) - qchisq(1-alpha, df=1)
}
uniroot(LR, c(0.000001,.999999), n=25, y=0, alpha=.05)
$root:
[1] 0.07395187
$f.root:
[1] -5.615436e-006

# or using nlmin
solveNonlinear <- function(f, y0, x, ...){
    # solve f(x) = y0
    # x is vector of initial guesses, same length as y0
    # ... are additional arguments to nlmin (not to f)
    g <- function(x, y0, f) sum((f(x) - y0)^2)
    g$y0 <- y0      # set g's default value for y0
    g$f <- f        # set g's default value for f
    nlmin(g, x, ...)
}
LR <- function(x) -50*log(1-x)    # the LR statistic when y=0 and n=25
solveNonlinear(LR, qchisq(.95,df=1), .5)

## Clopper-Pearson interval

The function binom.test(x=0, n=25) in R gives Clopper-Pearson intervals. The
binom.test function in S-PLUS doesn't give CIs. In S-PLUS, use binconf.
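When x = 0, the Clopper-Pearson limits also have a closed form, since
P(X = 0 | pi) = (1 - pi)^n: the lower limit is 0 and the upper limit solves
(1 - pi)^n = alpha/2. A minimal sketch (not in the original) to cross-check the
binconf call below:

# closed-form Clopper-Pearson limits for x = 0 successes in n = 25 trials
n <- 25; alpha <- .05
c(lower = 0, upper = 1 - (alpha/2)^(1/n))   # upper is about 0.137
# equivalently, via the beta quantile used for general x
qbeta(1 - alpha/2, 0 + 1, n - 0)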
library(Hmisc)
binconf(x=0, n=25, alpha=.05, method="exact")

## Blaker interval

acceptbin<-function(x, n, p){
    # computes the acceptability of p when x is observed and X is Bin(n, p)
    # adapted from Blaker (2000)
    p1<-1-pbinom(x-1, n, p)
    p2<-pbinom(x, n, p)
    a1<-p1 + pbinom(qbinom(p1, n, p) - 1, n, p)
    a2<-p2 + 1 - pbinom(qbinom(1-p2, n, p), n, p)
    min(a1, a2)
}

acceptinterval<-function(x, n, level, tolerance=1e-04){
    # computes acceptability interval for p at 1 - alpha equal to level
    # (in (0,1)) when x is an observed value of X which is Bin(n, p)
    lower<-0; upper<-1
    if(x) {
        lower<-qbeta((1-level)/2, x, n-x+1)
        while(acceptbin(x, n, lower) < (1-level)) lower<-lower+tolerance
    }
    if(x!=n) {
        upper<-qbeta(1-(1-level)/2, x + 1, n - x)
        while(acceptbin(x, n, upper) < (1-level)) upper<-upper-tolerance
    }
    c(lower=lower, upper=upper)
}

acceptinterval(0, 25, .95)

### Mid p-value
# we require the set of pi such that the mid-p-value exceeds 0.05;
# for x = 0 this means solving 0.5*(1-pi)^25 = 0.025 for the upper bound
.05^(1/25)
1-.05^(1/25)

### Pearson's Statistic
# Mendel's theories
res<-.pearson.x2(observed=c(6022,2001), expected=c(8023*.75,8023*.25))
1-pchisq(res$X2, df=1)
[1] 0.902528

### LR chi-squared test
# Mendel's theories
obs<-c(6022,2001)
expected<-8023*c(.75,.25)
1-pchisq(2 * sum(obs * log(obs/expected)), df=1)

### Maximum likelihood estimation (accident data)

obj.function<-function(p, y){
    -(-4.292*p[3] + y[1]*log(p[1]) + y[2]*log(p[2]) + y[3]*log(p[3]-p[1]) +
      y[4]*log(3.292*p[3]-p[2]))
}
nlminb(start=c(11, 62, 2*(11+4)), obj.function, lower=c(0,0,0),
    upper=c(Inf, Inf, Inf), scale=c(1,1,10), y=c(11, 62, 4, 7))
$parameters:
[1] 14.35228 57.89246 19.57129
$objective:
[1] -216.6862
$message:
[1] "RELATIVE FUNCTION CONVERGENCE"
$grad.norm:
[1] 0.00001071819
$iterations:
[1] 22

obj.res<-deriv(~-(-4.292*pi1 + y1*log(la1) + y2*log(la2) + y3*log(pi1-la1) +
    y4*log(3.292*pi1-la2)), c("la1","la2","pi1"),
    function(la1, la2, pi1, y1, y2, y3, y4) NULL)

obj.gr<-function(p, y){
    la1<-p[1]; la2<-p[2]; pi1<-p[3]
    y1<-y[1]; y2<-y[2]; y3<-y[3]; y4<-y[4]
    attr(obj.res(la1, la2, pi1, y1, y2, y3, y4), "gradient")
}
nlminb(start=c(11, 62, 2*(11+4)), objective=obj.function, gradient=obj.gr,
    lower=c(0,0,0), upper=c(Inf, Inf, Inf), scale=c(1,1,10), y=c(11, 62, 4, 7))

############# Chap. 2 - Agresti ###################

###### proportion parameters for two samples ######
# Table 2.1 (attack v. no attack)
x<-c(104,189)        # aspirin, placebo
n<-c(11037,11034)

# test H0:p1=p2 (equal probabilities of heart attack)
prop.test(x,n)
prop.test(x,n)$p.value
[1] 0

# Test H0:p1=p2 v. H1:p1<p2
prop.test(x, n, alt="less")$p.value

# Table 2.8
income<-c("<15000","15000-25000","25000-40000",">40000")
jobsat<-c("VD","LD","MS","VS")
table.2.8<-expand.grid(income=income, jobsat=jobsat)
Data<-c(1,2,1,0,3,3,6,1,10,10,14,9,6,7,12,11)
table.2.8<-cbind(table.2.8, count=Data)
temp<-crosstabs(count~income+jobsat, table.2.8)

# Function for computing gamma
Gamma.f<-function(x)
{
    # x is a matrix of counts. You can use output of crosstabs or xtabs.
    n<-nrow(x)
    m<-ncol(x)
    # concordant pairs
    res<-numeric((n-1)*(m-1))
    for(i in 1:(n-1)) {
        for(j in 1:(m-1)) res[j+(m-1)*(i-1)]<-x[i,j]*sum(x[(i+1):n,(j+1):m])
    }
    C<-sum(res)
    # discordant pairs
    res<-numeric((n-1)*(m-1))
    iter<-0
    for(i in 1:(n-1)) for(j in 2:m) {
        iter<-iter+1
        res[iter]<-x[i,j]*sum(x[(i+1):n,1:(j-1)])
    }
    D<-sum(res)
    gamma<-(C-D)/(C+D)
    list(gamma=gamma, C=C, D=D)
}

# Data from table 2.8
Gamma.f(temp)
$gamma:
[1] 0.2211009
$C:
[1] 1331
$D:
[1] 849

dos.time(Gamma.f(temp))
[1] 0.55

# Selvin (1998) computes the number of concordant and discordant pairs using
# the outer() function along with ifelse statements. However, that procedure
# is very memory intensive. The function above takes about 0.55 CPU seconds;
# Selvin's takes over 30 seconds.
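In R, the same table can be cross-checked with xtabs in place of crosstabs (a
minimal sketch, not in the original, assuming table.2.8 and Gamma.f as defined
above):

# xtabs is the R analogue of crosstabs; Gamma.f only needs the count matrix
temp.r <- xtabs(count ~ income + jobsat, data = table.2.8)
Gamma.f(temp.r)    # should reproduce gamma = 0.221 with C = 1331, D = 849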
############# Chap. 3 - Agresti ###################

#### Confidence intervals ####

# Table 3.1
Drug<-c("Placebo","Aspirin")
Infarction<-c("yes","no")
table.3.1<-expand.grid(drug=Drug, infarction=Infarction)
Data<-c(28,18,656,658)
table.3.1<-cbind(table.3.1, count=Data)
temp<-design.table(table.3.1)

# Wald CI
Wald.ci<-function(Table, aff.response, alpha=.05){
    # Gives two-sided Wald CI's for odds ratio, difference in proportions
    # and relative risk.
    # Table is a 2x2 table of counts with rows giving the treatment populations
    # aff.response is a string like "c(1,1)" giving the cell of the beneficial
    # response and the treatment category
    # alpha is significance level
    pow<-function(x, a=-1) x^a
    z.alpha<-qnorm(1-alpha/2)
    if(is.character(aff.response))
        where<-eval(parse(text=aff.response))
    else where<-aff.response
    Next<-as.numeric(where==1) + 1

    # OR
    odds.ratio<-Table[where[1],where[2]]*Table[Next[1],Next[2]] /
        (Table[where[1],Next[2]]*Table[Next[1],where[2]])
    se.OR<-sqrt(sum(pow(Table)))
    ci.OR<-exp(log(odds.ratio) + c(-1,1)*z.alpha*se.OR)

    # difference of proportions
    p1<-Table[where[1],where[2]]/(n1<-Table[where[1],Next[2]] +
        Table[where[1],where[2]])
    p2<-Table[Next[1],where[2]]/(n2<-Table[Next[1],where[2]] +
        Table[Next[1],Next[2]])
    se.diff<-sqrt(p1*(1-p1)/n1 + p2*(1-p2)/n2)
    ci.diff<-(p1-p2) + c(-1,1)*z.alpha*se.diff

    # relative risk
    RR<-p1/p2
    se.RR<-sqrt((1-p1)/(p1*n1) + (1-p2)/(p2*n2))
    ci.RR<-exp(log(RR) + c(-1,1)*z.alpha*se.RR)

    list(OR=list(odds.ratio=odds.ratio, CI=ci.OR),
         proportion.difference=list(diff=p1-p2, CI=ci.diff),
         relative.risk=list(relative.risk=RR, CI=ci.RR))
}

Wald.ci(temp, "c(1, 1)")    # or use Wald.ci(temp, c(1, 1))
$OR:
$OR$odds.ratio:
[1] 1.560298
$OR$CI:
[1] 0.8546703 2.8485020

$proportion.difference:
$proportion.difference$diff:
[1] 0.01430845
$proportion.difference$CI:
[1] -0.004868983  0.033485890

$relative.risk:
$relative.risk$relative.risk:
[1] 1.537362
$relative.risk$CI:
[1] 0.858614 2.752671

# Score CI
f <- function(p) {
    c(-(28/p[1])+(656/(1-p[1]))-p[3],
      -(18/p[2])+(658/(1-p[2]))+p[3],
      p[1]-p[2])
}

solveNonlinear <- function(f, y0, x, ...){
    # solve f(x) = y0
    # x is vector of initial guesses, same length as y0
    # ... are additional arguments to nlmin (not to f)
    g <- function(x, y0, f) sum((f(x) - y0)^2)
    g$y0 <- y0      # set g's default value for y0
    g$f <- f        # set g's default value for f
    nlmin(g, x, ...)
}
score.ci<-function(Delta) {
    temp<-solveNonlinear(f, y0=c(0,0,Delta), x=c(28/(28+656), 18/(18+658), 7),
        print.level=0, max.fcal=100, max.iter=100, init.step=.001)
    p<-temp$x[1:2]
    print(temp$x)
    z<-(p[1]-p[2]-Delta)/sqrt(p[1]*(1-p[1])/684 + p[2]*(1-p[2])/676)
}

Delta<-seq(-.2,.2,.001)
z<-sapply(Delta, score.ci)
Delta[abs(z) < qnorm(.975)]    # the Deltas not rejected form the score interval

# Function for computing gamma along with its standard error
Gamma2.f<-function(x, pr=0.95)
{
    # x is a matrix of counts; you can use output of crosstabs or xtabs
    # strip the crosstabs class first (needs >= S-PLUS 6.0)
    if(is.null(version$language) && inherits(x, "crosstabs")) {
        oldClass(x)<-NULL
        attr(x, "marginals")<-NULL
    }
    n <- nrow(x)
    m <- ncol(x)
    pi.c<-pi.d<-matrix(0, nr=n, nc=m)
    row.x<-row(x)
    col.x<-col(x)
    for(i in 1:(n)){
        for(j in 1:(m)){
            pi.c[i, j]<-sum(x[row.x<i & col.x<j]) + sum(x[row.x>i & col.x>j])
            pi.d[i, j]<-sum(x[row.x<i & col.x>j]) + sum(x[row.x>i & col.x<j])
        }
    }
    C <- sum(pi.c*x)/2      # concordant pairs
    D <- sum(pi.d*x)/2      # discordant pairs
    psi<-2*(D*pi.c-C*pi.d)/(C+D)^2
    sigma2<-sum(x*psi^2)-sum(x*psi)^2
    gamma <- (C - D)/(C + D)
    pr2 <- 1 - (1 - pr)/2
    CIa <- qnorm(pr2) * sqrt(sigma2) * c(-1, 1) + gamma
    list(gamma=gamma, C=C, D=D, sigma=sqrt(sigma2),
         Level=paste(100*pr, "%", sep=""),
         CI=paste(c("[", max(CIa[1], -1), ", ", min(CIa[2], 1), "]"),
             collapse=""))
}

# MC builds the values in env into f as default arguments
# (local S functions do not remember the frame they were created in,
# so the needed values must be attached to the function itself)
MC <- function(f, env = NULL)
{
    env <- as.list(env)
    if(length(env) > 0 && any(names(env) == ""))
        stop(paste("all arguments are not named:", env))
    fargs<-if(length(f) > 1) f[1:(length(f) - 1)] else NULL
    fargs<-c(fargs, env)
    if(any(duplicated(names(fargs))))
        stop(paste("duplicated arguments:", paste(names(fargs), collapse=", ")))
    fbody <- f[length(f)]
    cf<-c(fargs, fbody)
    mode(cf)<-"function"
    cf
}

hypergeometric.SPLUS <- function(n1, m1, N, psi)
{
    # noncentral hypergeometric distribution (adapted from Liao and Rosen 2001)
    n2 <- N - n1;
    if(n1<0 | n2<0 | m1<0 | m1>N | psi<=0) stop("wrong argument in hypergeometric");
    ll <- max(0, m1-n2);
    uu <- min(n1, m1);

    prob <- array( 1, uu-ll+1 );
    shift <- 1-ll;

    mode.compute <- substitute(function() {
        a <- psi - 1;
        b <- -( (m1+n1+2)*psi + n2-m1 ) ;
        c <- psi*(n1+1)*(m1+1);
        q <- b + sign(b)*sqrt(b*b-4*a*c);
        q <- -q/2;
        mode <- trunc(c/q);
        if(uu>=mode && mode>=ll) return(mode)
        else return( trunc(q/a) );
    }, list(psi=psi, m1=m1, n1=n1, ll=ll, uu=uu, n2=n2))
    mode <- mode.compute();

    r.function <- substitute(function(i) (n1-i+1)*(m1-i+1)/i/(n2-m1+i)*psi,
        list(psi=psi, m1=m1, n1=n1, ll=ll, uu=uu, n2=n2));

    if(mode<uu)    # fill in the probabilities above the mode
    {
        r1 <- r.function( (mode+1):uu );
        prob[ (mode+1 + shift):(uu + shift) ] <- cumprod(r1);
    }
    if(mode>ll)    # fill in the probabilities below the mode
    {
        r1 <- 1/r.function( mode:(ll+1) );
        prob[ (mode-1 + shift):(ll + shift) ] <- cumprod(r1);
    }
    prob <- prob/sum(prob);

    #############################################################################
    mean <- function() sum( prob[(ll:uu)+shift]*(ll:uu) );
    var <- function() sum( prob[(ll:uu)+shift]*(ll:uu)^2 ) - mean()^2;
    d <- function(x) return(prob[x + shift]);
    p <- substitute(function(x, lower.tail=TRUE) {
        if(lower.tail) return( sum(prob[(ll:x) + shift]) )
        else return( sum( prob[(x:uu) + shift] ) );
    }, list(prob=prob, shift=shift, ll=ll, uu=uu))

    ####################################################
    sample.low.to.high <- function(lower.end, ran) {
        for(i in lower.end:uu) {
            if(ran <= prob[i+shift]) return(i);
            ran <- ran - prob[i+shift];
        }
    }
    sample.high.to.low <- function(upper.end, ran) {
        for(i in upper.end:ll) {
            if(ran <= prob[i+shift]) return(i);
            ran <- ran - prob[i+shift];
        }
    }
    r <- function() {
        ran <- runif(1);
        if(mode==ll) return( sample.low.to.high(ll, ran) );
        if(mode==uu) return( sample.high.to.low(uu, ran) );
        if(ran < prob[mode+shift]) return(mode);
        ran <- ran - prob[mode+shift];
        lower <- mode - 1;
        upper <- mode + 1;
        repeat {
            if(prob[upper + shift] >= prob[lower + shift]) {
                if(ran < prob[upper+shift]) return(upper);
                ran <- ran - prob[upper+shift];
                if(upper == uu) return( sample.high.to.low(lower, ran) );
                upper <- upper + 1;
            }
            else {
                if(ran < prob[lower+shift]) return(lower);
                ran <- ran - prob[lower+shift];
                if(lower == ll) return( sample.low.to.high(upper, ran) );
                lower <- lower - 1;
            }
        }
    }

    ##########################################################################
    mean <- MC(mean, list(prob=prob, shift=shift, ll=ll, uu=uu))
    var <- MC(var, list(prob=prob, shift=shift, ll=ll, uu=uu))
    d <- MC(d, list(prob=prob, shift=shift, ll=ll, uu=uu))
    sample.low.to.high <- MC(sample.low.to.high, list(prob=prob, shift=shift, ll=ll, uu=uu))
    sample.high.to.low <- MC(sample.high.to.low, list(prob=prob, shift=shift, ll=ll, uu=uu))
    r <- MC(r, list(prob=prob, shift=shift, ll=ll, uu=uu))
    return(mean, var, d, p, r);    # S-PLUS multiple-value return; use a list in R
}
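When psi = 1 the noncentral hypergeometric reduces to the ordinary
hypergeometric, which gives a quick sanity check against R's dhyper (a sketch,
not in the original; note the multiple-value return above is S-PLUS syntax, so
in R the function would need to return a list):

# with psi = 1, d() should match the central hypergeometric probabilities
res <- hypergeometric.SPLUS(n1 = 4, m1 = 4, N = 8, psi = 1)
res$d(0:4)                         # probabilities from the noncentral code
dhyper(0:4, m = 4, n = 4, k = 4)   # R's central hypergeometric, same values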
f<-function(x, alpha, t0){
    # squared estimating equations for the lower and upper exact CI endpoints for psi
    resl<-hypergeometric.SPLUS(4,4,8,x[1])
    resu<-hypergeometric.SPLUS(4,4,8,x[2])
    sum(c(1-resl$p(t0-1) - alpha/2, resu$p(t0)-alpha/2)^2)
}

library(MASS)    # for optim (in S-PLUS)
res<-optim(par=c(.22, 622), fn=f, method="BFGS", alpha=.05, t0=3,
    control=list(parscale=c(1,100), trace=1))
$par:
[1]   0.2117342 626.2385697
$value:
[1] 1.345311e-013
$counts:
 function gradient
       92       80
$convergence:
[1] 0
$message:
NULL

# Using mid-p-value
f<-function(x, alpha, t0){
    resl<-hypergeometric.SPLUS(4,4,8,x[1])
    resu<-hypergeometric.SPLUS(4,4,8,x[2])
    sum(c(1-resl$p(t0-1) - .5*resl$d(t0) - alpha/2,
          resu$p(t0-1) + .5*resu$d(t0) - alpha/2)^2)
}
res<-optim(par=c(.22, 622), fn=f, method="BFGS", alpha=.05, t0=3,
    control=list(parscale=c(1,100), trace=1))
$par:
[1]   0.3100547 308.5567363
$value:
[1] 6.546435e-016
$counts:
 function gradient
       57       46
$convergence:
[1] 0
$message:
NULL

############# Chap. 4 - Agresti ###################

## Generalized Linear Models for binary data

# Table 4.2
n<-c(1379, 638, 213, 254)
snoring<-rep(c(0,2,4,5), n)
y<-rep(rep(c(1,0),4), c(24,1355,35,603,21,192,30,224))

# logit model
logitreg <- function(x, y, wt = rep(1, length(y)), intercept = T,
    start = rep(0, p), ...)
{
    # ML fit of a logistic regression by direct optimization
    if(!exists("optim")) library(MASS)
    fmin <- function(beta, X, y, w) {
        p <- plogis(X %*% beta)
        -sum(2 * w * ifelse(y, log(p), log(1-p)))
    }
    gmin <- function(beta, X, y, w) {
        eta <- X %*% beta; p <- plogis(eta)
        t(-2 * (w * dlogis(eta) * ifelse(y, 1/p, -1/(1-p)))) %*% X
    }
    if(is.null(dim(x))) dim(x) <- c(length(x), 1)
    dn <- dimnames(x)[[2]]
    if(!length(dn)) dn <- paste("Var", 1:ncol(x), sep="")
    p <- ncol(x) + intercept
    if(intercept) {x <- cbind(1, x); dn <- c("(Intercept)", dn)}
    if(is.factor(y)) y <- (unclass(y) != 1)
    fit <- optim(start, fmin, gmin, X = x, y = y, w = wt, ...)
    names(fit$par) <- dn
    cat("\nCoefficients:\n"); print(fit$par)
    cat("\nResidual Deviance:", format(fit$value), "\n")
    cat("\nConvergence message:", fit$convergence, "\n")
    invisible(fit)
}

(logit.fit<-logitreg(x=snoring, y=y, hessian=T, method="BFGS"))
Coefficients:
 (Intercept)     Var1
   -3.866245 0.397335
Residual Deviance: 837.7316
Convergence message: 0

# linear probability fit
lpmreg <- function(x, y, wt = rep(1, length(y)), intercept = T,
    start = rep(0, p), ...)
{
    # identical to logitreg except the inverse link is the identity
    if(!exists("optim")) library(MASS)
    fmin <- function(beta, X, y, w) {
        p <- X %*% beta
        -sum(2 * w * ifelse(y, log(p), log(1-p)))
    }
    gmin <- function(beta, X, y, w) {
        p <- X %*% beta
        t(-2 * (w * ifelse(y, 1/p, -1/(1-p)))) %*% X
    }
    if(is.null(dim(x))) dim(x) <- c(length(x), 1)
    dn <- dimnames(x)[[2]]
    if(!length(dn)) dn <- paste("Var", 1:ncol(x), sep="")
    p <- ncol(x) + intercept
    if(intercept) {x <- cbind(1, x); dn <- c("(Intercept)", dn)}
    if(is.factor(y)) y <- (unclass(y) != 1)
    fit <- optim(start, fmin, gmin, X = x, y = y, w = wt, ...)
    names(fit$par) <- dn
    cat("\nCoefficients:\n"); print(fit$par)
    cat("\nResidual Deviance:", format(fit$value), "\n")
    cat("\nConvergence message:", fit$convergence, "\n")
    invisible(fit)
}

(lpm.fit<-lpmreg(x=snoring, y=y, start=c(.05,.05), hessian=T, method="BFGS"))
Coefficients:
1: NAs generated in: log(x)
2: NAs generated in: log(x)
3: NAs generated in: log(x)
 (Intercept)       Var1
  0.01724645 0.01977784
Residual Deviance: 834.9919
Convergence message: 0
# probit model
probitreg <- function(x, y, wt = rep(1, length(y)), intercept = T,
    start = rep(0, p), ...)
{
    if(!exists("optim")) library(MASS)
    fmin <- function(beta, X, y, w) {
        p <- pnorm(X %*% beta)
        -sum(2 * w * ifelse(y, log(p), log(1-p)))
    }
    gmin <- function(beta, X, y, w) {
        eta <- X %*% beta; p <- pnorm(eta)
        t(-2 * (w * dnorm(eta) * ifelse(y, 1/p, -1/(1-p)))) %*% X
    }
    if(is.null(dim(x))) dim(x) <- c(length(x), 1)
    dn <- dimnames(x)[[2]]
    if(!length(dn)) dn <- paste("Var", 1:ncol(x), sep="")
    p <- ncol(x) + intercept
    if(intercept) {x <- cbind(1, x); dn <- c("(Intercept)", dn)}
    if(is.factor(y)) y <- (unclass(y) != 1)
    fit <- optim(start, fmin, gmin, X = x, y = y, w = wt, ...)
    names(fit$par) <- dn
    cat("\nCoefficients:\n"); print(fit$par)
    cat("\nResidual Deviance:", format(fit$value), "\n")
    cat("\nConvergence message:", fit$convergence, "\n")
    invisible(fit)
}

(probit.fit<-probitreg(x=snoring, y=y, start=c(-3.87,.40), hessian=T,
    method="BFGS"))
Coefficients:
 (Intercept)      Var1
    -2.06055 0.1877702
Residual Deviance: 836.7943
Convergence message: 0

# ASEs
sqrt(diag(solve(logit.fit$hessian)))
[1] 0.11753115 0.03536285
sqrt(diag(solve(lpm.fit$hessian)))
[1] 0.002426329 0.001976457
sqrt(diag(solve(probit.fit$hessian)))
[1] 0.04981505 0.01670894

# fitted probabilities
eta<-cbind(1,snoring)%*%logit.fit$par
logit.probs<-exp(eta)/(1+exp(eta))
eta<-cbind(1,snoring)%*%lpm.fit$par
lpm.probs<-eta
eta<-cbind(1,snoring)%*%probit.fit$par    # note: use the probit coefficients here
probit.probs<-pnorm(eta)

res<-cbind(logit=unique(logit.probs), lpm=unique(lpm.probs),
    probit=unique(probit.probs))
dimnames(res)[[1]]<-unique(snoring)
res
       logit        lpm probit
0 0.02050626 0.01724271 0.0197
2 0.04428670 0.05683766 0.0460
4 0.09302543 0.09643260 0.0952
5 0.13239167 0.11623008 0.1310

# figure 4.1
snoring.plot<-unique(snoring)
plot(snoring, logit.probs, type="n", xlim=c(0,5), ylim=c(-.005,.20),
    xlab="Level of Snoring", ylab="Predicted Probability", bty="L")
lines(snoring.plot, unique(logit.probs), type="b", pch=16)
lines(snoring.plot, unique(probit.probs), type="b", pch=17)
lines(snoring.plot, unique(lpm.probs), type="l", lty=1)
key(x=.5, y=.18, text=list(c("Logistic","Probit","Linear")),
    lines=list(type=c("b","b","l")), lty=c(1,1,1), pch=c(16,17,1), cex=.75,
    divide=3, border=T, text.width.multiplier=.8)

# IRLS
snoring<-c(0,2,4,5)
prop<-c(24/1379, 35/638, 21/213, 30/254)

# LPM
(lpm.irls<-glm(prop~snoring, weights=c(1379,638,213,254),
    family=quasi(link=identity)))
Coefficients:
 (Intercept)  snoring
  0.01687233 0.020038
Degrees of Freedom: 4 Total; 2 Residual
Residual Deviance: 0.003977004

summary(lpm.irls)$coefficients
                 Value   Std. Error  t value
(Intercept) 0.01687233 0.0011341069 14.87719
    snoring 0.02003800 0.0005094491 39.33269

# Logistic
(logit.irls<-glm(cbind(yes=c(24,35,21,30), no=c(1355,603,192,224))~snoring,
    family=binomial(link=logit)))
Coefficients:
 (Intercept)   snoring
   -3.866248 0.3973366
Degrees of Freedom: 4 Total; 2 Residual
Residual Deviance: 2.808912

summary(logit.irls)$coefficients
                 Value Std. Error    t value
(Intercept) -3.8662481 0.16620356 -23.262125
    snoring  0.3973366 0.05000865   7.945358

# Probit
(probit.irls<-glm(cbind(yes=c(24,35,21,30), no=c(1355,603,192,224))~snoring,
    family=binomial(link=probit)))
Coefficients:
 (Intercept)   snoring
   -2.060552 0.1877704
Degrees of Freedom: 4 Total; 2 Residual
Residual Deviance: 1.871561

summary(probit.irls)$coefficients
                 Value Std. Error    t value
(Intercept) -2.0605515 0.07016609 -29.366769
    snoring  0.1877704 0.02348045   7.996883
### GLMs for Poisson counts

# figure 4.3
table.4.3<-read.table("crab.ssc", col.names=c("C","S","W","Sa","Wt"))
plot.table.4.3<-aggregate(rep(1,nrow(table.4.3)),
    list(Sa=table.4.3$Sa, W=table.4.3$W), sum)
plot.table.4.3 <- convert.col.type(target = plot.table.4.3,
    column.spec = list("Sa"), column.type = "double")
# more efficient: plot.table.4.3$Sa<-as.numeric(levels(plot.table.4.3$Sa))[plot.table.4.3$Sa]
plot.table.4.3 <- convert.col.type(target = plot.table.4.3,
    column.spec = list("W"), column.type = "double")
# more efficient: plot.table.4.3$W<-as.numeric(levels(plot.table.4.3$W))[plot.table.4.3$W]

plot(y=plot.table.4.3$Sa, x=plot.table.4.3$W, xlab="Width (cm)",
    ylab="Number of Satellites", bty="L", axes=F, type="n")
axis(2, at=1:15)
axis(1, at=seq(20,34,2))
text(y=plot.table.4.3$Sa, x=plot.table.4.3$W, labels=plot.table.4.3$x)

# figure 4.4
table.4.3$W.fac<-cut(table.4.3$W, breaks=c(0,seq(23.25, 29.25),Inf))
plot.y<-aggregate(table.4.3$Sa, by=list(W=table.4.3$W.fac), mean)$x
plot.x<-aggregate(table.4.3$W, by=list(W=table.4.3$W.fac), mean)$x
par(pty="s")
plot(x=plot.x, y=plot.y, ylab="Number of Satellites", xlab="Width (cm)",
    bty="L", axes=F, type="p", pch=16)
axis(2, at=0:5)
axis(1, at=seq(20,34,2))
res<-gam(plot.y~s(plot.x, df=1.5), family=poisson(link=log), x=T)
#res<-gam(plot.y~s(plot.x, df=5), family=poisson(link=log))
lines(x=plot.x, y=res$fitted.values)
summary(res)
plot.gam(res, se=T)

# Poisson log linear model
log.fit<-glm(Sa~W, family=poisson(link=log), data=table.4.3)
summary(log.fit)
log.fit$null.deviance-log.fit$deviance
[1] 64.91309
attributes(summary(log.fit))
summary(log.fit)$coefficients
attributes(log.fit)
log.fit$fitted.values
fitted(log.fit)

# predict at given width value
predict.glm(log.fit, type="response", newdata=data.frame(W=26.3))

# Poisson glm with identity link
id.fit<-glm(Sa~W, family=poisson(link=identity), data=table.4.3,
    start=predict(log.fit, type="link"))
summary(id.fit)

# figure 4.5
plot(x=plot.x, y=plot.y, ylab="", xlab="Width (cm)", bty="L", axes=F,
    type="p", pch=16)
axis(2, at=0:5)
axis(1, at=seq(20,34,2))
guiCreate( "CommentDate", Name = "GSD2$1",
    Title = "Mean Number of Satellites, \\\"Symbol\"m",
    FillColor = "Transparent", FontSize="16")
guiModify( "CommentDate", Name = "GSD2$1", xPosition = "0.542857",
    yPosition = "3.52967", Angle = "90")
ind<-order(table.4.3$W)
lines(x=table.4.3$W[ind], y=log.fit$fitted.values[ind])
lines(x=table.4.3$W[ind], y=id.fit$fitted.values[ind])
arrows(x1=23.5, y1=2.9, x2=23.5,
    y2=predict(log.fit, newdata=data.frame(W=23.5), type="response"),
    open=T, size=.3)
text(x=23.5, y=3, "Log Link")
arrows(x1=29.75, y1=3.1, x2=29.75,
    y2=predict(id.fit, newdata=data.frame(W=29.75), type="response"),
    open=T, size=.3)
text(x=29.75, y=2.9, "Identity Link")

# Compare AICs
library(MASS)
(extractAIC.glm(log.fit)[[2]]+2*log.fit$null.deviance)/2
[1] 571.8786
extractAIC(id.fit)[[2]]
[1] 561.7163

## Overdispersion in Poisson GLIMs

# table 4.4
tapply(table.4.3$Sa, table.4.3$W.fac,
    function(x) c(mean=mean(x), variance=var(x)))
# by(table.4.3$Sa, table.4.3$W.fac, function(x) c(mean=mean(x), variance=var(x)))

# detecting overdispersion
summary.glm(log.fit)
summary.glm(log.fit, dispersion=0)$dispersion
res<-summary.glm(log.fit)
sqrt(diag(res$cov.unscaled)*summary.glm(log.fit, dispersion=0)$dispersion)
summary.glm(log.fit, dispersion=0)$coefficients

null.fit<-glm(Sa~1, family=poisson(link=log), data=table.4.3)
anova.glm(null.fit, log.fit, test="F")
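As a cross-check (a sketch, not in the original), the dispersion estimate used
above is just the Pearson statistic divided by the residual degrees of freedom:

# estimated dispersion for log.fit: Pearson X^2 / residual df (173 - 2 = 171)
sum(residuals(log.fit, type = "pearson")^2) / log.fit$df.residual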
(deviance(null.fit)-deviance(log.fit))/summary.glm(log.fit, dispersion=0)$dispersion

## Negative binomial GLIMs
library(MASS)
glm(Sa~W, family=negative.binomial(theta=1, link="identity"), data=table.4.3,
    start=predict(log.fit, type="link"))
# using neg.bin
glm(Sa~W, family=neg.bin(theta=1), data=table.4.3)
# using glm.nb
nb.fit<-glm.nb(Sa ~ W, data = table.4.3, init.theta=1, link=identity,
    start=predict(log.fit, type="link"))
summary(nb.fit)
glm.convert(nb.fit)

## Residuals
resid(log.fit, type="deviance")
pear.res<-resid(log.fit, type="pearson")
pear.std<-resid(log.fit, type="pearson")/sqrt(1-lm.influence(log.fit)$hat)
par(mfrow=c(2,2))
plot(pear.res, xlab="observation", ylab="Pearson Residuals")
abline(h=0)
plot(pear.std, xlab="observation", ylab="Standardized Pearson Residuals")
abline(h=0)
plot(log.fit, ask=T)

## Quasi-LH and GLIMs

# horseshoe crab data (females)
Sa<-tapply(table.4.3$Sa, table.4.3$W, sum)
mu<-tapply(predict(log.fit, type="response"), table.4.3$W, sum)
(chi.squared<-sum(((Sa-mu)^2)/mu))
[1] 174.2737
chi.squared/64

# teratology data
table.4.5<-read.table("teratology.ssc",
    col.names=c("","group","litter.size","num.dead"))[,-1]
table.4.5$group<-as.factor(table.4.5$group)
fit1<-glm(num.dead/litter.size~group-1, weights=litter.size, data=table.4.5,
    family=binomial)
(pred<-unique(round(predict(fit1, type="response"),3)))
[1] 0.758 0.102 0.034 0.048
(SE<-sqrt(pred*(1-pred)/tapply(table.4.5$litter.size, table.4.5$group, sum)))
         1          2          3         4
0.02368473 0.02786104 0.02379655 0.0209615
table.4.5$litter.size*predict(fit1, type="response")
(chi.squared<-sum(resid(fit1, type="pearson")^2))
[1] 154.7069
SE*sqrt(chi.squared/54)
         1          2          3          4
0.04008367 0.04715159 0.04027292 0.03547493

# quasi family
glm(num.dead/litter.size~group-1, weights=litter.size, data=table.4.5,
    family=quasi(variance="mu(1-mu)"), start=predict(fit1, type="response"))
Coefficients:
    group1    group2     group3     group4
 0.7584098 0.1016949 0.03448276 0.04807692
Degrees of Freedom: 58 Total; 54 Residual
Residual Deviance: 173.4532

fit3<-glm(num.dead/litter.size~group, weights=litter.size, data=table.4.5,
    family=quasi(variance="mu(1-mu)"), start=predict(fit1, type="response"))
(pred<-unique(round(predict(fit3, type="response"),3)))

## GAMs for binary response

# figure 4.7
table.4.3$Sa.bin<-ifelse(table.4.3$Sa>0,1,0)
plot.table.4.3<-aggregate(table.4.3$Sa,
    by=list(Sa.bin=table.4.3$Sa.bin, W=table.4.3$W), length)
plot.table.4.3 <- convert.col.type(target = plot.table.4.3,
    column.spec = list("Sa.bin"), column.type = "double")
plot.table.4.3 <- convert.col.type(target = plot.table.4.3,
    column.spec = list("W"), column.type = "double")
par(pty="s")
plot(y=table.4.3$Sa.bin, x=table.4.3$W, xlab="Width, x (cm)",
    ylab="Probability of presence of satellites", axes=F, type="n")
axis(2, at=c(0,1))
axis(1, at=seq(20,34,2))
text(y=plot.table.4.3$Sa.bin, x=plot.table.4.3$W, labels=plot.table.4.3$x,
    cex=.5)
guiModify( "XAxisTitle", Name = "GSD2$1$Axis2dX1$XAxisTitle",
    Title = "Width, `x` (cm)")
res<-gam(Sa.bin~s(W, df=3), family=binomial(link=logit), x=T,
    data=plot.table.4.3)
lines(x=plot.table.4.3$W, y=res$fitted.values)
prop<-aggregate(table.4.3$Sa.bin, by=table.4.3$W.fac, mean)$x
lines(plot.x, prop, type="p", pch=16)    # see above for defn of plot.x
plot.gam(res, se=T)
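In R, a comparable smooth fit can be sketched with the mgcv package (an
assumption on my part; the gam call above is the S-PLUS version with
s(W, df=3)):

# minimal R sketch, assuming mgcv is installed; k roughly controls smoothness
library(mgcv)
res.mgcv <- gam(Sa.bin ~ s(W, k = 4), family = binomial, data = table.4.3)
plot(res.mgcv, se = TRUE)   # smooth of width with pointwise 2*SE bands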
############# Chap. 5 - Agresti ###################

## logistic regression for horseshoe crab data
# table.4.3$Sa.bin<-ifelse(table.4.3$Sa>0,1,0)
(crab.fit.logit<-glm(Sa.bin~W, family=binomial, data=table.4.3))
Call:
glm(formula = Sa.bin ~ W, family = binomial, data = table.4.3)
Coefficients:
 (Intercept)         W
   -12.35082 0.4972305
Degrees of Freedom: 173 Total; 171 Residual
Residual Deviance: 194.4527

# Recall:
# table.4.3$W.fac<-cut(table.4.3$W, breaks=c(0,seq(23.25, 29.25),Inf))
# prop<-aggregate(table.4.3$Sa.bin, by=table.4.3$W.fac, mean)$x
# plot.x<-aggregate(table.4.3$W, by=list(W=table.4.3$W.fac), mean)$x

graphsheet()
par(pty="s")
plot(y=table.4.3$Sa.bin, x=table.4.3$W, xlab="", ylab="", axes=F, type="n")
axis(2, at=seq(0,1,.2))
axis(1, at=seq(20,34,2))
guiModify( "XAxisTitle", Name = "GSD2$1$Axis2dX1$XAxisTitle",
    Title = "Width, `x` (cm)")
guiModify( "YAxisTitle", Name = "GSD2$1$Axis2dY1$YAxisTitle",
    Title = "Proportion having satellites, \\\"Symbol\"p(\\\"Arial\"x\\\"Symbol\")")
lines(y=prop, x=plot.x, pch=16, type="p")
ind<-order(table.4.3$W)
lines(x=table.4.3$W[ind], y=predict(crab.fit.logit, type="response")[ind],
    type="l", lty=3)

# Table 5.2
# Table with successes and failures per width (66 different widths)
cont.table<-crosstabs(~W+Sa.bin, data=table.4.3, margin=list(),
    drop.unused.levels=F)
# R Bertolusso:
# cont.table <- tapply(table.4.3$Sa.bin, table.4.3[,c("W","Sa.bin")], length)
# cont.table <- ifelse(is.na(cont.table), 0, cont.table)

# Extract widths from dim names
w.unique <-as.numeric(attr(cont.table,"dimnames")$W)

# matrix of successes and failures
matrix.succ.fail<-structure(.Data=cont.table, dim=c(66,2))[,2:1]
# R Bertolusso:
# matrix.succ.fail <- cbind(cont.table[,"1"], cont.table[,"0"])

# First 2 columns of table 5.2
w.cut <- cut(w.unique, breaks=c(0,seq(23.25, 29.25),Inf), left.include=T)
observed<-apply(matrix.succ.fail, 2, aggregate, by=list(W=w.cut), sum)
(observed <- matrix(c(observed[[1]][,ncol(observed[[1]])],
    observed[[2]][,ncol(observed[[2]])]), ncol = 2))
# R Bertolusso:
#observed.yes <- aggregate(matrix.succ.fail[,1], by=list(W=w.cut), sum)
#observed.no <- aggregate(matrix.succ.fail[,2], by=list(W=w.cut), sum)
#(observed <- matrix(c(observed.yes[,ncol(observed.yes)],
#    observed.no[,ncol(observed.no)]), ncol = 2))

# Last 2 columns
# Each fitted probability (and 1 - fitted probability) is multiplied by the
# total of observations for that width and totaled by group.
fit.logit <- glm(matrix.succ.fail~w.unique, family=binomial)
fitted.yes <- aggregate(predict(fit.logit, type="response") *
    apply(matrix.succ.fail,1,sum), by=list(W=w.cut), sum)
fitted.no <- aggregate((1-predict(fit.logit, type="response")) *
    apply(matrix.succ.fail,1,sum), by=list(W=w.cut), sum)
fitted.all <- matrix(c(fitted.yes$x, fitted.no$x), ncol = 2)

# Chi-square test
(x.squared = sum((observed-fitted.all)^2/fitted.all))
# df = #binomial samples - #coefficients
df <- length(observed[,1]) - length(fit.logit$coefficients)
1-pchisq(x.squared, df)
# G2
# R. Bertolusso (take medians of categories)
W.fac<-cut(table.4.3$W, breaks=c(0,seq(23.25, 29.25),Inf), left.include=T)
glm(observed~aggregate(table.4.3$W, by=list(W=W.fac), median)$x,
    family=binomial)$deviance
# Agresti (midpoint of category spread)
glm(observed~seq(22.75,29.75), family=binomial)$deviance

# Inference
summary.glm(crab.fit.logit, correlation=F)
crab.fit.logit$null.deviance-crab.fit.logit$deviance
[1] 31.30586

# Profile likelihood ratio interval
library(MASS)
confint(crab.fit.logit)    # see R script

# score ci for x=26.5
res<-prop.test(x=4, n=6, conf.level=0.95, correct=F)
res$conf.int

# plot of estimate of p(x) and pointwise confidence interval
crab.predict<-predict(crab.fit.logit, type="response", se=T)
ind<-order(table.4.3$W)
plot(table.4.3$W[ind], crab.predict$fit[ind], axes=F, type="l",
    xlim=c(20,33), ylab="Probability of satellite", xlab="")
axis(2, at=seq(0,1,.2))
axis(1, at=seq(20,32,2))
guiModify( "XAxisTitle", Name = "GSD2$1$Axis2dX1$XAxisTitle",
    Title = "Width, `x` (cm)")
lines(table.4.3$W[ind], crab.predict$fit[ind]-1.96*crab.predict$se[ind], lty=3)
lines(table.4.3$W[ind], crab.predict$fit[ind]+1.96*crab.predict$se[ind], lty=3)

# Hosmer-Lemeshow GOF test
table.4.3$prob.group<-cut(crab.predict$fit,
    breaks=quantile(x=crab.predict$fit, seq(0,1,.1)), include.lowest=T)
#table.4.3$prob.group<-cut(crab.predict$fit, breaks=10)
#table.4.3$prob.group<-cut(order(crab.predict$fit), breaks=seq(0,173,17.3), include.lowest=T)
table.4.3$predict<-crab.predict$fit
Hosmer.GOF<-sum(unlist(by(table.4.3, table.4.3$prob.group, function(x){
    p<-sum(x$predict)
    ((sum(x$Sa.bin)-p)^2)/(p*(1-p/nrow(x)))
})))
1-pchisq(Hosmer.GOF, 8)

# Logit models for categorical data
# Table 5.3
Alcohol<-factor(c("0","<1","1-2","3-5",">=6"),
    levels=c("0","<1","1-2","3-5",">=6"))
malformed<-c(48,38,5,1,1)
n<-c(17066,14464,788,126,37)+malformed
options(contrasts=c("contr.treatment", "contr.poly"))
Table.5.3.logit<-glm(malformed/n~Alcohol, family=binomial(link=logit),
    weights=n)    # saturated model

revAlcohol<-factor(c("0","<1","1-2","3-5",">=6"),
    levels=rev(c("0","<1","1-2","3-5",">=6")))
Table.5.3.logit2<-glm(malformed/n~revAlcohol, family=binomial(link=logit),
    weights=n)    # saturated model

cbind(logit=predict(Table.5.3.logit),
    fitted.prop=predict(Table.5.3.logit, type="response"))
      logit fitted.prop
1 -5.873642 0.002804721
2 -5.941832 0.002620328
3 -5.060060 0.006305170
4 -4.836282 0.007874016
5 -3.610918 0.026315789

cbind(logit=predict(Table.5.3.logit2),
    fitted.prop=predict(Table.5.3.logit2, type="response"))
      logit fitted.prop
1 -5.873642 0.002804721
2 -5.941832 0.002620328
3 -5.060060 0.006305170
4 -4.836282 0.007874016
5 -3.610918 0.026315789

(Table.5.3.logit3<-glm(malformed/n~1, family=binomial(link=logit),
    weights=n))    # independence model

# LR statistic
summary(Table.5.3.logit3)$deviance
[1] 6.201998
# Pearson chi-squared statistic
sum(residuals(Table.5.3.logit3, type="pearson")^2)
[1] 12.08205
1-pchisq(12.08205, df=4)

# Linear logit model
scores<-c(0,.5,1.5,4,7)
Table.5.3.LL<-glm(malformed/n~scores, family=binomial, weights=n)
summary(Table.5.3.LL)
# chi-squared statistic
sum(residuals(Table.5.3.LL, type="pearson")^2)
[1] 2.050051
# lrt
Table.5.3.LL$null.deviance-Table.5.3.LL$deviance
# profile ci's
library(MASS)
confint(Table.5.3.LL)
# estimated probabilities
cbind(logit=predict(Table.5.3.LL),
    fitted.prop=predict(Table.5.3.LL, type="response"))

# Cochran-Armitage Trend test
x<-c(rep(scores,malformed), rep(scores,n-malformed))
y<-c(rep(1,sum(malformed)), rep(0,sum(n-malformed)))
(z2<-32574*cor(x,y)^2)
1-pchisq(z2, df=1)
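In R, the same trend test is available directly via prop.trend.test in the
stats package (a sketch, not in the original):

# Cochran-Armitage trend test; should agree with z2 above (about 6.6 on 1 df)
prop.trend.test(malformed, n, score = scores)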
# Using an ordered factor:
AlcoholO<-as.ordered(Alcohol)
res<-glm(malformed/n~AlcoholO, family=binomial, weights=n)
summary(res)$coefficients
                   Value Std. Error     t value
 (Intercept) -5.0645469  0.3022257 -16.7574989
  AlcoholO.L  1.1522319  0.6647804   1.7332520
  AlcoholO.Q  0.4557700  0.7769480   0.5866158
  AlcoholO.C  1.2732177  0.4425533   2.8769814
AlcoholO ^ 4  0.6580112  0.7650966   0.8600368
# the quartic effect might be spurious

## Multiple logit model
# AIDS data
table.5.5<-expand.grid(AZT=factor(c("Yes","No"), levels=c("No","Yes")),
    Race=factor(c("White","Black"), levels=c("Black","White")))
table.5.5<-data.frame(table.5.5, Yes=c(14,32,11,12), No=c(93,81,52,43))

# Fit logit model
# set contrasts to treatment
options(contrasts=c("contr.treatment","contr.poly"))
fit<-glm(cbind(Yes,No) ~ AZT + Race, family=binomial, data=table.5.5)
summary(fit)
confint(fit)

# lrt of conditional independence of race and AIDS given AZT
fit2<-update(object=fit, formula=~ . -Race)
anova(fit, fit2, test="Chisq")

res<-predict(fit, type="response", se=T)
pointwise.normal<-function(results.predict, coverage = 0.99)
{
    fit <- results.predict$fit
    limits <- qnorm(1. - (1. - coverage)/2.) * results.predict$se.fit
    list(upper = fit + limits, fit = fit, lower = fit - limits)
}
AIDS.bars<-pointwise.normal(res, coverage=.95)
error.bar(c(2,2,1,1), y=AIDS.bars$fit, AIDS.bars$lower, AIDS.bars$upper,
    incr=F, gap=F, xlim=c(0,3), ylim=c(0,.35), axes=F, xlab="Race",
    ylab="Probability of AIDS (95% CI)", pch=".")
axis(1, at=c(2,1), labels=c("White","Black"))
axis(2, at=c(0,.1,.2,.3))
lines(c(1,2), AIDS.bars$fit[c(3,1)])
lines(c(1,2), AIDS.bars$fit[c(4,2)])
attach(table.5.5)
propAIDS<-Yes/(Yes+No)
points(c(2,2,1,1), propAIDS, pch=16)
detach(2)

# GOF
fit$deviance
sum(residuals(fit, type="pearson")^2)

# lrt of conditional independence of race and AIDS given AZT
fit2<-update(object=fit, formula=~ . -Race)
(fit2$deviance-fit$deviance)
[1] 0.03708355

## Multiple logistic regression
options(contrasts=c("contr.treatment", "contr.poly"))
table.4.3$C.fac<-factor(table.4.3$C, levels=c("5","4","3","2"),
    labels=c("dark","med-dark","med","med-light"))
crab.fit.logist<-glm(Sa.bin~C.fac+W, family=binomial, data=table.4.3)
summary(crab.fit.logist, cor=F)

# profile LH CI
library(MASS)
confint(crab.fit.logist)

# figure 5.5
res1<-predict(crab.fit.logist, type="response",
    newdata=data.frame(W=seq(18,34,1), C.fac="med-light"))
res2<-predict(crab.fit.logist, type="response",
    newdata=data.frame(W=seq(18,34,1), C.fac="med"))
res3<-predict(crab.fit.logist, type="response",
    newdata=data.frame(W=seq(18,34,1), C.fac="med-dark"))
res4<-predict(crab.fit.logist, type="response",
    newdata=data.frame(W=seq(18,34,1), C.fac="dark"))
plot(seq(18,34,1), res1, type="l", bty="L", ylab="Predicted Probability",
    axes=F)
axis(2, at=seq(0,1,.2))
axis(1, at=seq(18,34,2))
guiModify( "XAxisTitle", Name = "GSD2$1$Axis2dX1$XAxisTitle",
    Title = "Width, `x` (cm)")
lines(seq(18,34,1), res2)
lines(seq(18,34,1), res3)
lines(seq(18,34,1), res4)
arrows(x1=29, y1=res1[25-17], x2=25, y2=res1[25-17], size=.25, open=T)
text(x=29.1, y=res1[25-17], "Color 1", adj=0)
arrows(x1=23, y1=res2[26-17], x2=26, y2=res2[26-17], size=.25, open=T)
text(x=21.1, y=res2[26-17], "Color 2", adj=0)
arrows(x1=28.9, y1=res3[24-17], x2=24, y2=res3[24-17], size=.25, open=T)
text(x=29, y=res3[24-17], "Color 3", adj=0)
arrows(x1=25.9, y1=res4[23-17], x2=23, y2=res4[23-17], size=.25, open=T)
text(x=26, y=res4[23-17], "Color 4", adj=0)
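For bands around fitted curves like these, one option in R (a sketch, not in
the original) is to compute pointwise intervals on the logit scale and then
invert, which keeps the limits inside (0, 1):

# 95% pointwise band for dark crabs over widths 18-34, built on the link scale
new <- data.frame(W = seq(18, 34, 1),
    C.fac = factor("dark", levels = levels(table.4.3$C.fac)))
pr <- predict(crab.fit.logist, newdata = new, type = "link", se.fit = TRUE)
plogis(pr$fit + 1.96 * pr$se.fit)   # upper limits on the probability scale
plogis(pr$fit - 1.96 * pr$se.fit)   # lower limits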
# lrt of no interaction (homogeneity of the width effect across color)
crab.fit.logist.ia<-update(object=crab.fit.logist, formula=~ . + W:C.fac)
anova(crab.fit.logist, crab.fit.logist.ia, test="Chisq")

# quantitative ordinal
crab.fit.logist.ord<-glm(Sa.bin~C+W, family=binomial, data=table.4.3)
summary(crab.fit.logist.ord, cor=F)
# lrt comparing scores model to qualitative model
anova(crab.fit.logist.ord, crab.fit.logist, test="Chisq")

# binary scores model
table.4.3$C.bin<-ifelse(table.4.3$C<5,1,0)
glm(Sa.bin~C.bin+W, family=binomial, data=table.4.3)

## Example problem 5.17
duration<-c(45,15,40,83,90,25,35,65,95,35,75,45,50,75,30,25,20,60,70,30,60,
    61,65,15,20,45,15,25,15,30,40,15,135,20,40)
type<-c(0,0,0,1,1,1,rep(0,5),1,1,1,0,0,1,1,1,rep(0,4),1,1,0,1,0,1,0,0,rep(1,4))
sore<-c(0,0,rep(1,10),0,1,0,1,0,rep(1,4),0,1,0,0,1,0,1,0,1,1,0,1,0,0)
sore.fr<-cbind(duration, type, sore)

options(contrasts=c("contr.treatment","contr.poly"))
sorethroat.lg<-glm(sore ~ type*duration, family=binomial)
summary(sorethroat.lg)
sorethroat.lg<-glm(sore ~ type*scale(duration), family=binomial)
summary(sorethroat.lg)

# LRT of interaction
sorethroat.lg2<-glm(sore ~ type + scale(duration), family=binomial)
anova(sorethroat.lg2, sorethroat.lg, test="Chisq")

# plot
plot(c(15,135), c(0,1), type="n", xlab="duration", ylab="prob")
text(duration, sore, as.character(ifelse(type,"T","L")))
lines(15:135, predict.glm(sorethroat.lg,
    data.frame(duration=15:135, type=1), type="response"))
lines(15:135, predict.glm(sorethroat.lg,
    data.frame(duration=15:135, type=0), type="response"), lty=2)
key(x=100, y=.6, text=list(c("Tracheal","Laryngeal")),
    lines=list(lty=1:2, size=2), border=T)

# test for type difference at 1 hour
sorethroat.lgA<-glm(sore ~ type*I(scale(duration)-.502), family=binomial)
summary(sorethroat.lgA)

# test for parallel lines across type
type.fac<-factor(ifelse(type,"trach","laryn"), levels=c("trach","laryn"))
sorethroat.lgB<-update(sorethroat.lg, . ~ type.fac + scale(duration) -1)
summary(sorethroat.lgB)$coefficients
library(MASS)
dose.p(sorethroat.lgB, c(1,3), (1:3)/4)
dose.p(sorethroat.lgB, c(2,3), (1:3)/4)

############# Chap. 6 - Agresti ###################

## backward elimination for horseshoe crab
options(contrasts=c("contr.treatment", "contr.poly"))
table.4.3$C.fac<-factor(table.4.3$C, levels=c("5","4","3","2"))
table.4.3$S.fac<-factor(table.4.3$S, levels=c("3","2","1"))
crab.fit.logist.full<-glm(Sa.bin~C.fac+S.fac+W+I(Wt/1000), family=binomial,
    data=table.4.3)
summary(crab.fit.logist.full)
Coefficients:
                 Value Std. Error    t value
(Intercept) -9.2733819  3.8364636 -2.4171693
     C.fac4  1.1198011  0.5931762  1.8878052
     C.fac3  1.5057627  0.5665525  2.6577638
     C.fac2  1.6086660  0.9353686  1.7198203
     S.fac2 -0.4962679  0.6290766 -0.7888830
     S.fac1 -0.4002868  0.5025636 -0.7964899
          W  0.2631276  0.1952484  1.3476557
 I(Wt/1000)  0.8257794  0.7036640  1.1735422

(Dispersion Parameter for Binomial family taken to be 1 )
Null Deviance: 225.7585 on 172 degrees of freedom
Residual Deviance: 185.202 on 165 degrees of freedom
Number of Fisher Scoring Iterations: 4

Correlation of Coefficients:
           (Intercept)     C.fac4     C.fac3     C.fac2     S.fac2     S.fac1          W
    C.fac4  -0.1318818
    C.fac3  -0.0670015  0.7233703
    C.fac2  -0.0043035  0.4499020  0.5507148
    S.fac2  -0.2184819 -0.0733221 -0.1685117 -0.2471148
    S.fac1  -0.0120010 -0.0327826 -0.2074473 -0.3672790  0.2431179
         W  -0.9649203  0.0241011 -0.0308300 -0.0336341  0.1922667  0.0161518
I(Wt/1000)   0.6740016 -0.0097672 -0.0014684 -0.0365701 -0.0891985 -0.0402631 -0.8308544

# LRT
crab.fit.logist.full$null.deviance-crab.fit.logist.full$deviance

# backward elimination
options(contrasts=c("contr.treatment", "contr.poly"))
table.4.3$C.fac<-factor(table.4.3$C, levels=c("5","4","3","2"))
table.4.3$S.fac<-factor(table.4.3$S, levels=c("3","2","1"))
crab.fit.logist.stuffed<-glm(Sa.bin~C.fac*S.fac*W, family=binomial,
    data=table.4.3)
res<-step.glm(crab.fit.logist.stuffed,
    list(lower = ~1, upper = formula(crab.fit.logist.stuffed)),
    scale=1, trace=T, direction="backward")
res$anova
library(MASS)
res<-stepAIC(crab.fit.logist.stuffed,
    list(lower = ~1, upper = formula(crab.fit.logist.stuffed)),
    scale=1, trace=T, direction="backward")
res$anova

## Fitting causal logit models
table.6.3<-expand.grid(list(M=c("divorced","married"), E=c("yes","no"),
    P=c("yes","no"), G=c("Female","Male")))
count<-c(17,4,54,25,36,4,214,322,28,11,60,42,17,4,68,130)
table.6.3.expand<-table.6.3[rep(1:(length(count)), count),]

options(contrasts=c("contr.treatment","contr.poly"))
EMS.10<-glm(P ~ 1, family=binomial, data=table.6.3.expand)
EMS.11<-update(EMS.10, .~. +G)
anova(EMS.10, EMS.11)
Analysis of Deviance Table
Response: P
  Terms Resid. Df Resid. Dev Test Df Deviance
1     1      1035   1123.914
2     G      1034   1048.654       1  75.2594

EMS.20<-glm(E ~ 1, family=binomial, data=table.6.3.expand)
EMS.21<-update(EMS.20, .~. +P)
EMS.22<-update(EMS.21, .~. +G)
anova(EMS.20, EMS.21, EMS.22)
Analysis of Deviance Table
Response: E
  Terms Resid. Df Resid. Dev Test Df Deviance
1     1      1035   746.9373
2     P      1034   700.9209       1 46.01636
3 P + G      1033   698.0138   +G  1  2.90718

EMS.31<-glm(M ~ E+P, family=binomial, data=table.6.3.expand)
EMS.32<-update(EMS.31, .~. +E:P)
EMS.33<-update(EMS.32, .~. +G)
anova(EMS.31, EMS.32, EMS.33)
Analysis of Deviance Table
Response: M
            Terms Resid. Df Resid. Dev Test Df Deviance
1           E + P      1033   1344.180
2     E + P + E:P      1032   1331.266 +E:P  1 12.91404
3 E + P + G + E:P      1031   1326.718   +G  1  4.54768
## Logistic regression diagnostics

# Residuals
# Table 6.5
BP<-factor(c("<117","117-126","127-136","137-146","147-156","157-166",
    "167-186",">186"))
CHD<-c(3,17,12,16,12,8,16,8)
n<-c(156,252,284,271,139,85,99,43)

resCHD<-glm(CHD/n~1, family=binomial, weights=n)
resCHD$deviance
[1] 30.02257

pred.indep<-n*predict(resCHD, type="response")
dev.indep<-resid(resCHD, type="deviance")
pear.indep<-resid(resCHD, type="pearson")
pear.std.indep<-resid(resCHD, type="pearson")/sqrt(1-lm.influence(resCHD)$hat)
structure(cbind(pred.indep, dev.indep, pear.indep, pear.std.indep),
    dimnames=list(BP, c("fitted","deviance resid","pearson resid",
    "pearson std resid")))

par(mfrow=c(2,2))
plot(pear.std.indep, xlab="", ylab="Standardized Pearson Residuals",
    pch=16, axes=F)
axis(1, at=1:8, labels=as.character(BP), srt=90)
axis(2)
abline(h=0)

out<-pear.std.indep>3
par(mfrow=c(2,2))
plot(pear.std.indep, xlab="", ylab="Standardized Pearson Residuals",
    axes=F, type="n")
text((1:8)[out], pear.std.indep[out], ">3")
points((1:8)[!out], pear.std.indep[!out], pch=16)
axis(1, at=1:8, labels=as.character(BP), srt=90)
axis(2)
abline(h=0)

# Linear logit model
scores<-c(seq(from=111.5, to=161.5, by=10), 176.5, 191.5)
resLL<-glm(CHD/n~scores, family=binomial, weights=n)
resLL$deviance
[1] 5.909158
pred.indep<-n*predict(resLL, type="response")
dev.indep<-resid(resLL, type="deviance")
pear.indep<-resid(resLL, type="pearson")
pear.std.indep<-resid(resLL, type="pearson")/sqrt(1-lm.influence(resLL)$hat)
structure(cbind(pred.indep, dev.indep, pear.indep, pear.std.indep),
    dimnames=list(as.character(scores), c("fitted","deviance resid",
    "pearson resid","pearson std resid")))

# figure 6.2
plot(scores, CHD/n, pch="X", yaxt="n", xaxt="n", ylim=c(0,.2),
    xlab="Blood Pressure Level", ylab="Proportion", bty="L")
axis(side=1, ticks=T, at=seq(from=110, to=200, by=10))
axis(side=2, ticks=T, at=seq(from=0, to=.2, by=.05))
lines(scores, predict(resLL, type="response"), type="l")

# Table 6.7
yes<-c(32,21,6,3,12,34,3,4,52,5,8,6,35,30,9,11,6,15,17,4,9,21,26,25,21,7,25,
    31,3,9,10,25,25,39,2,4,3,0,29,6,16,7,23,36,4,10)
no<-c(81,41,0,8,43,110,1,0,149,10,7,12,100,112,1,11,3,6,0,1,9,19,7,16,10,8,
    18,37,0,6,11,53,34,49,123,41,3,2,13,3,33,17,9,14,62,54)
table.6.7<-cbind(expand.grid(gender=c("female","male"),
    dept=c("anth","astr","chem","clas","comm","comp","engl","geog","geol",
    "germ","hist","lati","ling","math","phil","phys","poli","psyc","reli",
    "roma","soci","stat","zool")), prop=yes/(yes+no))
res.gradadmit<-glm(prop~dept, family=binomial, data=table.6.7, weights=yes+no)
res.gradadmit$deviance
sum(resid(res.gradadmit, type="pearson")^2)
resid(res.gradadmit, type="pearson")/sqrt(1-lm.influence(res.gradadmit)$hat)

# influence diagnostics
sign(resLL$coefficients[2]-lm.influence(resLL)$coefficients[,2,drop=F]) *
    sqrt(Cook.terms(resLL)[,2])
        scores
1  0.492099277
2 -1.142149031
3  0.327960528
4  0.081380326
5  0.007858333
6 -0.065196274
7  0.400949178
8 -0.123737455

# Predictive power
library(Hmisc, first=T)
library(Design, first=T)
#BP<-factor(c("<117","117-126","127-136","137-146","147-156","157-166","167-186",">186"))
#scores<-c(seq(from=111.5,to=161.5,by=10),176.5,191.5)
#CHD<-c(3,17,12,16,12,8,16,8)
#n<-c(156,252,284,271,139,85,99,43)
res<-numeric(2*length(CHD))
res[rep(c(T,F), length(CHD))]<-CHD
res[rep(c(F,T), length(CHD))]<-n-CHD
table.6.5<-data.frame(CHD=rep(rep(c(1,0), length(CHD)), res), BP=rep(BP,n),
    scores=rep(scores,n))
options(contrasts=c("contr.treatment", "contr.poly"))

res.lrm<-lrm(CHD~scores, data=table.6.5, x=T, y=T)

Logistic Regression Model

lrm(formula = CHD ~ scores, data = table.6.5, x = T, y = T)

Frequencies of Responses
   0  1
1237 92

 Obs Max Deriv Model L.R. d.f. P     C   Dxy Gamma Tau-a    R2 Brier
1329    2e-005      24.11    1 0 0.636 0.273 0.316 0.035 0.045 0.063

          Coef     S.E.     Wald Z P
Intercept -6.08203 0.724320 -8.40  0
scores     0.02434 0.004843  5.03  0

(1-exp((resLL$deviance-resLL$null.deviance)/8))/(1-exp(-resLL$null.deviance/8))
junk<-glm(formula = CHD ~ scores, family = binomial, data=table.6.5)
(1-exp((junk$deviance-junk$null.deviance)/1329))/(1-exp(-junk$null.deviance/1329))
Rsquared.glm(junk)
validate.lrm(res.lrm, method="boot", B=100)
detach("Design")
detach("Hmisc")

## Conditional independence tests
table.6.9<-data.frame(scan(file="clinical trials table 69.ssc",
    what=list(Center="", Treatment="", Response="", Freq=0)))
levels(table.6.9$Treatment)<-c("Drug","Control")
levels(table.6.9$Response)<-c("Success","Failure")
table.6.9.array<-design.table(table.6.9[,c(2,3,1,4)])

oddsratio.L <- function (x, stratum = NULL, Log = TRUE, conf.level = 0.95,
    correct=T)
{
    l <- length(dim(x))
    if (l > 2 && is.null(stratum)) stratum <- 3:l
    if (l - length(stratum) > 2)
        stop("All but 2 dimensions must be specified as strata.")
    if (l == 2 && dim(x) != c(2, 2)) stop("Not a 2 x 2 - table.")
    if (!is.null(stratum) && dim(x)[-stratum] != c(2, 2))
        stop("Need strata of 2 x 2 - tables.")
    lor <- function(y, correct, Log) {
        if(correct) y<-y + 0.5
        or <- y[1, 1] * y[2, 2]/y[1, 2]/y[2, 1]
        if (Log) log(or) else or
    }
    ase <- function(y, correct) sqrt(sum(1/(ifelse(correct, y + 0.5, y))))
    if (is.null(stratum)) {
        LOR <- lor(x, correct, Log)
        ASE <- ase(x, correct)
    }
    else {
        LOR <- apply(x, stratum, lor, correct=correct, Log=Log)
        ASE <- apply(x, stratum, ase, correct=correct)
    }
    I <- ASE * qnorm((1 + conf.level)/2)
    Z <- LOR/ASE
    structure(LOR, ASE = if (Log) ASE,
        lwr = if (Log) LOR - I else exp(log(LOR) - I),
        upr = if (Log) LOR + I else exp(log(LOR) + I),
        Z = if (Log) Z, P = if (Log) 1 - pnorm(abs(Z)),
        log = Log, class = "oddsratio")
}

oddsratio.L(table.6.9.array, correct=F, Log=F)

# CMH Test
mantelhaen.test(table.6.9.array)

# logit model test
n<-aggregate(table.6.9$Freq, list(table.6.9$Treatment, table.6.9$Center),
    FUN=sum)$x
table.6.9$n<-rep(n, rep(2,16))
options(contrasts=c("contr.treatment", "contr.poly"))
res<-glm(Freq/n~Center+Treatment, family=binomial, data=table.6.9,
    weights=n, subset=Response=="Success")
summary(res)
Coefficients:
                 Value Std. Error    t value
(Intercept) -0.5450944  0.2929052 -1.8609927
    Centerb  2.0554344  0.4200885  4.8928604
    Centerc  1.1529027  0.4245668  2.7154802
    Centerd -1.4184537  0.6635739 -2.1375972
    Centere -0.5198903  0.5337883 -0.9739635
    Centerf -2.1469176  1.0603341 -2.0247558
    Centerg -0.7977076  0.8149166 -0.9788824
    Centerh  2.2079143  0.7195076  3.0686463
  Treatment -0.7769203  0.3066807 -2.5333195

(Dispersion Parameter for Binomial family taken to be 1 )
Null Deviance: 93.5545 on 15 degrees of freedom
Residual Deviance: 9.746317 on 7 degrees of freedom

anova(res, test="Chisq")
Analysis of Deviance Table
Binomial model
Response: Freq/n
Terms added sequentially (first to last)
          Df Deviance Resid. Df Resid. Dev     Pr(Chi)
     NULL                    15   93.55450
   Center  7 77.13937         8   16.41513 0.000000000
Treatment  1  6.66882         7    9.74632 0.009811426

# heterogeneity of conditional ORs
res2<-update(res, .~.-Treatment)
res3<-update(res, .~.+Center:Treatment)
anova(res2, res3, test="Chisq")
Analysis of Deviance Table
Response: Freq/n
                                   Terms Resid. Df Resid. Dev                        Test Df Deviance    Pr(Chi)
1                                 Center         8   16.41513
2 Center + Treatment + Center:Treatment         0    0.00060 +Treatment+Center:Treatment  8 16.41453 0.03681686
# estimate common odds ratio
exp(-res$coefficient[9])

# test homogeneity of odds ratios
res$deviance

woolf.test<-function (x)
{
    DNAME <- deparse(substitute(x))
    x <- x + 1/2
    k <- dim(x)[3]
    or <- apply(x, 3, function(x) (x[1, 1] * x[2, 2])/(x[1, 2] * x[2, 1]))
    w <- apply(x, 3, function(x) 1/sum(1/x))
    o <- log(or)
    e <- weighted.mean(log(or), w)
    STATISTIC <- sum(w * (o - e)^2)
    PARAMETER <- k - 1
    PVAL <- 1 - pchisq(STATISTIC, PARAMETER)
    METHOD <- "Woolf-test on Homogeneity of Odds Ratios (no 3-Way assoc.)"
    names(STATISTIC) <- "X-squared"
    names(PARAMETER) <- "df"
    structure(list(statistic = STATISTIC, parameter = PARAMETER,
        p.value = PVAL, method = METHOD, data.name = DNAME,
        observed = o, expected = e), class = "htest")
}

woolf.test(table.6.9.array)
        Woolf-test on Homogeneity of Odds Ratios (no 3-Way assoc.)
data:  table.6.9.array
X-squared = 5.818, df = 7, p-value = 0.5612

## models to improve inferential power
table.6.11<-data.frame(change=factor(c("worse","stationary",
    "slight improvement","moderate improvement","marked improvement"),
    levels=c("worse","stationary","slight improvement",
    "moderate improvement","marked improvement")),
    high=c(1,13,16,15,7), n=c(12, 66, 58, 42, 18))

res.leprosy<-glm(high/n~change, weights=n, family=binomial, data=table.6.11)
anova(res.leprosy, test="Chisq")
Analysis of Deviance Table
Binomial model
Response: high/n
Terms added sequentially (first to last)
       Df Deviance Resid. Df Resid. Dev   Pr(Chi)
  NULL                    4   7.277707
change  4 7.277707         0   0.000000 0.1219205

resLL.leprosy<-glm(high/n~c(-1,0,1,2,3), weights=n, family=binomial,
    data=table.6.11)
anova(resLL.leprosy, test="Chisq")
Analysis of Deviance Table
Binomial model
Response: high/n
Terms added sequentially (first to last)
                  Df Deviance Resid. Df Resid. Dev     Pr(Chi)
             NULL                    4   7.277707
c(-1, 0, 1, 2, 3)  1 6.651426         3   0.626282 0.009907651

anova(resLL.leprosy, res.leprosy, test="Chisq")

## power and sample size
binomial.sample.size(n1=20, n2=25, p=.6, p2=.7, alpha=.05,
    alternative="two.sided")
   p1  p2 delta alpha      power n1 n2 prop.n2
1 0.6 0.7   0.1  0.05 0.05610319 20 25    1.25
bpower(n1=20, n2=25, p1=.6, p2=.7)
    Power
0.1084922
binomial.sample.size(n1=25, n2=25, p=.6, p2=.7, alpha=.05,
    alternative="two.sided")
   p1  p2 delta alpha      power n1 n2 prop.n2
1 0.6 0.7   0.1  0.05 0.06137109 25 25       1
bpower(n=50, p1=.6, p2=.7, alpha=0.05)
    Power
0.1135017
binomial.sample.size(n1=100, n2=100, p=.6, p2=.7, alpha=.05,
    alternative="two.sided")
bpower(n=200, p1=.6, p2=.7, alpha=0.05)
    Power
0.3158429

# sample size
binomial.sample.size(p=.6, p2=.7, alpha=.05, alternative="two.sided",
    power=.9)
   p1  p2 delta alpha power  n1  n2 prop.n2
1 0.6 0.7   0.1  0.05   0.9 497 497       1
bsamsize(power=.9, p1=.6, p2=.7, alpha=0.05)
      n1       n2
476.0072 476.0072
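Base R offers a comparable calculation in power.prop.test (a sketch, not in
the original; its approximation differs a little from bpower's, so expect
similar but not identical answers):

# two-sample proportion power and sample size via stats::power.prop.test
power.prop.test(n = 25, p1 = .6, p2 = .7)       # power when n1 = n2 = 25
power.prop.test(power = .9, p1 = .6, p2 = .7)   # n per group for 90% power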
# power for chi-square test
chipower.f<-function(p=NULL, pm=NULL, n, alpha, df, pearson=T, fit=NULL)
{
    # noncentral chi-squared approximation to power;
    # p and pm are the true and model-implied cell probabilities
    if(pearson) nc<-n*.pearson.x2(observed=p, expected=pm)$X2
    else nc<-n*fit$deviance
    1-pchisq(qchisq(1-alpha, df), df=df, ncp=nc)
}

# example 1
table.6.13<-data.frame(expand.grid(New = c("very worry", "somewhat",
    "reassuring"), Standard=c("worry", "reassure")),
    prop=c(.04*.4, .08*.32, .04*.27, .02*.3, .18*.22, .64*.15))
fit<-glm(prop~New+Standard, data=table.6.13, family="binomial")
table.6.13.array<-design.table(table.6.13)
# pearson
chipower.f(p=table.6.13$prop, pm=fitted(fit), n=400, alpha=.05, df=2)
# LR
chipower.f(fit=fit, n=400, alpha=.05, df=2, pearson=F)

# example 2
table.2.5<-data.frame(expand.grid(Smoker=c("yes","no"),
    Lung.Cancer=c("Cases","Controls")), count=c(688,21,650,59))

expected.f<-function (x, frequency = c("absolute", "relative"))
{
    # expected counts under independence
    if (!is.array(x)) stop("Need array of absolute frequencies!")
    frequency <- match.arg(frequency)
    n <- sum(x)
    x <- x/n
    d <- length(dim(x))
    tab <- apply(x, 1, sum)
    for (i in 2:d) tab <- tab %o% apply(x, i, sum)
    if (frequency == "relative") tab else tab * n
}

table.2.5.array<-design.table(table.2.5)
fit<-expected.f(table.2.5.array)
chipower.f(p=table.2.5.array/1418, pm=fit/1418, n=sum(table.2.5.array),
    alpha=.05, df=1)

## Probit and complementary log-log

# Table 6.14
table.6.14<-data.frame(
    log.dose=c(1.691,1.724,1.755,1.784,1.811,1.837,1.861,1.884),
    n=c(59,60,62,56,63,59,62,60),
    y=c(6,13,18,28,52,53,61,60))

# probit
options(contrasts=c("contr.treatment", "contr.poly"))
res.probit<-glm(y/n~log.dose, weights=n, family=binomial(link=probit),
    data=table.6.14)

n<-c(59,60,62,56,63,59,62,60)
y<-rep(rep(c(1,0),8), c(6,59-6,13,60-13,18,62-18,28,56-28,52,63-52,53,59-53,
    61,62-61,60,0))
(res.probit2<-probitreg(x=rep(table.6.14$log.dose,n), y=y,
    start=res.probit$coefficients, hessian=T, method="BFGS"))
summary(res.probit)$coefficients
sqrt(diag(solve(res.probit2$hessian)))

# cloglog
(res.cloglog<-glm(y/n~log.dose, weights=n, family=binomial(link=cloglog),
    data=table.6.14))

data.frame(table.6.14, fitted.probit=round(n*fitted(res.probit),1),
    fitted.cloglog=round(n*fitted(res.cloglog),1))

plot(table.6.14$log.dose, table.6.14$y/table.6.14$n, pch=16,
    xlab="Log dosage", ylab="Proportion Killed", bty="L", axes=F)
axis(1, at=seq(1.7, 1.9, .05))
axis(2, at=seq(0,1,.2))
lines(table.6.14$log.dose, fitted(res.probit), lty=2)
lines(table.6.14$log.dose, fitted(res.cloglog), lty=1)
key(x=1.8, y=.4, text=list(c("Complementary log-log", "Probit")),
    lines=list(type=c("l", "l"), lty=c(1,2)), border=F,
    text.width.multiplier=.8)
## conditional logistic regression
malformed<-c(48,38,5,1,1)
n<-c(17066,14464,788,126,37)+malformed
temp<-numeric(10)
temp[rep(c(T,F),5)]<-c(48,38,5,1,1)
temp[rep(c(F,T),5)]<-c(17066,14464,788,126,37)
table.5.3<-data.frame(Alcohol=rep(c(0,.5,1.5,4,7), n),
    case=rep(rep(c(1,0),5), temp))

# coxph (from the survival library) gives the exact conditional fit
fit<-coxph(Surv(rep(1,sum(temp)), case)~Alcohol, table.5.3, method="exact")

# cond (from the cond library) gives approximate conditional inference
(fit<-cond(glm(formula=case~Alcohol, family=binomial, data=table.5.3),
    offset=Alcohol))
summary(fit, test=T)

# using cl.saddle
attach(table.5.3)
approx.points<-cl.saddle(y=case, formula=case~Alcohol, term="Alcohol",
    family=binomial, u=1, l=-1, B=20)
plot(approx.points)
1-cpvalue.saddle(y=case, formula=case~Alcohol, term="Alcohol",
    family=binomial, H0=0, cc=F)
detach(table.5.3)

## conditional inference for 2x2xK tables
table.6.15<-data.frame(expand.grid(race=c("black","white"),
    promote=c("yes","no"), month=c("july","august","september")),
    count=c(0,4,7,16,0,4,7,13,0,2,8,13))
mantelhaen.test(design.table(table.6.15), correct=F)

## separation case (diarrhea)
table.6.16<-expand.grid(Ceph=factor(c(0,1)), Age=factor(c(0,1)),
    Stay=factor(c(0,1)), case=factor(c(0,1)))
table.6.16$count<-c(385,0,789-3,0,233-5,0,1081-47,0,0,0,3,0,5,0,47,5)
fisher.test(design.table(table.6.16[table.6.16$Age==1 & table.6.16$Stay==1,
    c("Ceph","case","count")]))

############# Chap. 7 - Agresti ###################

## Nominal response models

# Table 7.1
food.labs<-factor(c("fish","invert","rep","bird","other"),
    levels=c("fish","invert","rep","bird","other"))
size.labs<-factor(c("<2.3",">2.3"), levels=c(">2.3","<2.3"))
gender.labs<-factor(c("m","f"), levels=c("m","f"))
lake.labs<-factor(c("hancock","oklawaha","trafford","george"),
    levels=c("george","hancock","oklawaha","trafford"))
#food.labs<-c("fish","invert","rep","bird","other")
#size.labs<-c("<2.3",">2.3")
#gender.labs<-c("m","f")
#lake.labs<-c("george","hancock","oklawaha","trafford")
table.7.1<-expand.grid(food=food.labs, size=size.labs, gender=gender.labs,
    lake=lake.labs)
temp<-c(7,1,0,0,5,4,0,0,1,2,16,3,2,2,3,3,0,1,2,3,2,2,0,0,1,13,7,6,0,0,3,9,1,
    0,2,0,1,0,1,0,3,7,1,0,1,8,6,6,3,5,2,4,1,1,4,0,1,0,0,0,13,10,0,2,2,9,0,0,
    1,2,3,9,1,0,1,8,1,0,0,1)
table.7.1<-structure(.Data=table.7.1[rep(1:nrow(table.7.1),temp),],
    row.names=1:219)

# fit models in Table 7.2
library(nnet)
options(contrasts=c("contr.treatment","contr.poly"))
fitS<-multinom(food~lake*size*gender, data=table.7.1)    # saturated model
fit0<-multinom(food~1, data=table.7.1)                   # null
fit1<-multinom(food~gender, data=table.7.1)              # G
fit2<-multinom(food~size, data=table.7.1)                # S
fit3<-multinom(food~lake, data=table.7.1)                # L
fit4<-multinom(food~size+lake, data=table.7.1)           # L + S
fit5<-multinom(food~size+lake+gender, data=table.7.1)    # L + S + G

deviance(fit1)-deviance(fitS)   # or fit1@deviance - fitS@deviance (S-PLUS only)
deviance(fit2)-deviance(fitS)
deviance(fit3)-deviance(fitS)
deviance(fit4)-deviance(fitS)
deviance(fit5)-deviance(fitS)
deviance(fit0)-deviance(fitS)

getSlots(fit5)

# collapse over gender
options(contrasts=c("contr.treatment","contr.poly"))
fitS<-multinom(food~lake*size, data=table.7.1)    # saturated model
fit0<-multinom(food~1, data=table.7.1)            # null
fit1<-multinom(food~size, data=table.7.1)         # S
fit2<-multinom(food~lake, data=table.7.1)         # L
fit3<-multinom(food~size+lake, data=table.7.1)    # L + S

deviance(fit1)-deviance(fitS)   # or fit1@deviance - fitS@deviance (S-PLUS only)
deviance(fit2)-deviance(fitS)
deviance(fit3)-deviance(fitS)
deviance(fit0)-deviance(fitS)
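In R's nnet, nested pairs of these fits can also be compared directly with
anova (a sketch, not in the original):

# LR test of the size effect given lake: compare models L and L + S
anova(fit2, fit3)   # the anova method for multinom objects gives the LR test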
# Get fitted counts
marg.counts <- tapply(table.7.1$food,
    list(factor(table.7.1$size, levels = c("<2.3", ">2.3")),
         factor(table.7.1$lake,
             levels = c("hancock", "oklawaha", "trafford", "george"))),
    length)
row.names.71 <- rev(expand.grid(dimnames(marg.counts)))
fitted.counts<-round(as.vector(marg.counts) *
    fit3@fitted[!duplicated(as.data.frame(fit3@fitted)),], 1)
structure(.Data=as.data.frame(fitted.counts),
    row.names=apply(row.names.71, 1, paste, collapse=" "))

library(MASS)
summary(fit3, cor=F)

# estimating response probabilities
#options(contrasts=c("contr.treatment", "contr.poly"))
predict(fit3, type="probs",
    newdata=data.frame(size=factor(">2.3", levels=c(">2.3","<2.3")),
    lake=factor("hancock",
        levels=c("george","hancock","oklawaha","trafford"))))
cbind(expand.grid(size=size.labs, lake=lake.labs),
    predict(fit3, type="probs",
        newdata=expand.grid(size=size.labs, lake=lake.labs)))

## Proportional Odds Models

# mental impairment data
table.7.5<-read.table("mental.ssc", col.names=c("mentalC", "ses", "life"))
table.7.5$mental<-ordered(table.7.5$mentalC, levels=1:4,
    labels=c("well","mild","moderate","impaired"))
table.7.5$ses<- -table.7.5$ses
table.7.5$life<- -table.7.5$life

# polr in MASS
library(MASS)
fit.polr<-polr(mental~ses+life, data=table.7.5)
summary(fit.polr)

# lrm in Design
library(Hmisc)
library(Design)
table.7.5$ses<- -table.7.5$ses      # undo the sign reversal used for polr
table.7.5$life<- -table.7.5$life
table.7.5$mental.rev<-ordered(table.7.5$mentalC, levels=4:1,
    labels=c("impaired","moderate","mild","well"))
(fit.lrm<-lrm(mental.rev~ses+life, data=table.7.5))

# figure 7.6
plot(0:9, rowSums(predict(fit.polr, newdata=data.frame(ses=rep(-1,10),
    life=-c(0:9)), type="probs")[,3:4]), axes=F, lty=2, type="l",
    ylim=c(0,1), bty="L", ylab="P(Y > 2)", xlab="Life Events Index")
axis(2)
axis(1, at=0:9)
lines(0:9, rowSums(predict(fit.polr, newdata=data.frame(ses=rep(0,10),
    life=-c(0:9)), type="probs")[,3:4]), lty=1)
text(2.5, .5, "SES=0")
arrows(2.5, .48, 2.75, .42, open=T)
text(3.5, .33, "SES=1")
arrows(3.5, .31, 3.75, .25, open=T)

# score test: see R script

# interaction model
(fit.lrm<-lrm(mental.rev~ses*life, data=table.7.5))

## Adjacent-categories logits model (via baseline-categories logit model)
table.7.8<-read.table("c:/program files/r/rw1070/cda/jobsat.r",
    col.names=c("gender","income","jobsat","freq"))
table.7.8$jobsatf<-ordered(table.7.8$jobsat,
    labels=c("very diss","little sat","mod sat","very sat"))
# build the category-specific covariates needed for adjacent-categories logits
table.7.8$gender<-ifelse(table.7.8$jobsatf=="very diss",
    3*table.7.8$gender, table.7.8$gender)
table.7.8$gender<-ifelse(table.7.8$jobsatf=="little sat",
    2*table.7.8$gender, table.7.8$gender)
table.7.8$income<-ifelse(table.7.8$jobsatf=="very diss",
    3*table.7.8$income, table.7.8$income)
table.7.8$income<-ifelse(table.7.8$jobsatf=="little sat",
    2*table.7.8$income, table.7.8$income)
library(MASS)
polr(jobsatf~gender+income, weights=freq, data=table.7.8)
table.7.8$jobsatf<-ordered(table.7.8$jobsat,
    labels=c("very sat","mod sat","little sat","very diss"))
multinom(jobsatf~gender+income, weights=freq, data=table.7.8)

## Continuation-Ratio logit models

# lrm from Design
# Add the libraries in this order:
library(Hmisc, T)
library(Design, T)

y<-ordered(c("non-live","malformed","normal"),
    levels=c("non-live","malformed","normal"))
x<-c(0, 62.5, 125, 250, 500)
table.7.9<-data.frame(expand.grid(y=y, x=x),
    freq=c(15,1,281,17,0,225,22,7,283,38,59,202,144,132,9))
table.7.9a<-structure(.Data=menuUnstackColumns(source=table.7.9,
    source.col.spec=c("freq"), group=c("y"), show.p=F),
    row.names=x, names=unique(levels(y)))
y<-rep(rep(y,5), table.7.9$freq)
x<-rep(x, tapply(table.7.9$freq, table.7.9$x, sum))
u<-cr.setup(y)
y.u<-u$y
x.u<-x[u$subs]

# j=1
fit1<-lrm(y.u[u$cohort=="all"]~x.u[u$cohort=="all"])
fit1$deviance
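An aside before fitting the j=2 model: in R the whole continuation-ratio model
can also be fit in one call with the VGAM package (a sketch, not in the
original; it assumes VGAM is installed and a dose-by-outcome count matrix like
table.7.9a, and the cratio parameterization should be checked against the
cohort-by-cohort fits here):

# continuation-ratio logits via VGAM; columns are (non-live, malformed, normal)
library(VGAM)
doses <- c(0, 62.5, 125, 250, 500)
fit.cr <- vglm(as.matrix(table.7.9a) ~ doses, family = cratio(reverse = FALSE))
summary(fit.cr)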
# For j=2
fit2<-lrm(y.u[u$cohort=="y>=malformed"]~x.u[u$cohort=="y>=malformed"])
# To fit both models together, fit an interaction term, as in the following.
fit<-lrm(y.u~u$cohort*x.u)
# To get selected odds ratios for the j=2 model, first issue the datadist
# command and reissue the lrm call, as follows
x.u<-x.u[u$cohort=="y>=malformed"]
dd<-datadist(x.u)
options(datadist='dd')
fit<-lrm(y.u[u$cohort=="y>=malformed"]~x.u)
summary(fit)
summary(fit, x=c(125,250))
summary(fit, x=c(250,500))

## using glm
# Set x and the weights for the first linear logit model
x<-c(0,62.5,125,250,500)
n1<-rowSums(table.7.9a)   # use the whole table
# j=1
fit<-glm(table.7.9a[,1]/n1~x, family=binomial(link=logit), weights=n1)
fit$null.deviance
[1] 259.1073
fit$null.deviance-fit$deviance   # this is what lrm gave us as the model L.R.
[1] 253.3298
summary(fit)
...
Null Deviance: 259.1073 on 4 degrees of freedom
Residual Deviance: 5.777478 on 3 degrees of freedom
# The difference of the two deviances gives the model L.R. statistic:
# 259.1073 - 5.7775 = 253.3298 on 4 - 3 = 1 df.
# For j=2, take the second and third columns of table.7.9a
n2<-rowSums(table.7.9a[,c(2,3)])
glm(table.7.9a[,2]/n2~x, family=binomial, weights=n2)

### Mean response models
fit.glm<-glm(jobsat~gender+income, weights=freq, family=poisson(identity), data=table.7.8)
fit.aov<-aov(jobsat~gender+income, data=table.7.8, weights=freq)

## Using WLS directly
menuUnstackColumns(source=as.data.frame(table.7.8), target=table.7.8a, source.col.spec=c("freq"), group=c("jobsat"), show.p=F)   # S-PLUS
table.7.8a<-structure(.Data=table.7.8a, row.names=apply(expand.grid(income=1:4, gender=c(1,0)),1,paste,collapse=" "), names=levels(table.7.8$jobsatf))
n<-rowSums(table.7.8a)
p<-sweep(table.7.8a,1,n,FUN="/")
p1<-p[1,]; p2<-p[2,]; p3<-p[3,]; p4<-p[4,]
p5<-p[5,]; p6<-p[6,]; p7<-p[7,]; p8<-p[8,]
rm(p)
J<-4   # number of response categories
I<-8   # number of covariate populations
# Compute V1, ..., V8, the per-population multinomial covariance matrices
Vnames<-sapply(1:I, function(x) paste("V", x, collapse=" ", sep=""))
pnames<-sapply(1:I, function(x) paste("p", x, collapse=" ", sep=""))
V<-matrix(0, nc=J, nr=J)
for(i in 1:I){
  p<-eval(parse(text=pnames[i]))
  diag(V)<-p*(1-p)
  p<-as.matrix(p)
  junk<-matrix(-kronecker(p,p), nc=J, nr=J, byrow=T)
  V[lower.tri(diag(J))]<-junk[lower.tri(junk)]
  V<-t(V)
  V[lower.tri(V)]<-junk[lower.tri(junk, diag=F)]
  assign(Vnames[i], matrix(V/n[i], ncol=J, byrow=T))
}
junk<-matrix(0,J,J)
V<-rbind(
  cbind(V1,junk,junk,junk,junk,junk,junk,junk),
  cbind(junk,V2,junk,junk,junk,junk,junk,junk),
  cbind(junk,junk,V3,junk,junk,junk,junk,junk),
  cbind(junk,junk,junk,V4,junk,junk,junk,junk),
  cbind(junk,junk,junk,junk,V5,junk,junk,junk),
  cbind(junk,junk,junk,junk,junk,V6,junk,junk),
  cbind(junk,junk,junk,junk,junk,junk,V7,junk),
  cbind(junk,junk,junk,junk,junk,junk,junk,V8))
js<-1:4
Q<-rbind(
  c(js, rep(0,28)),
  c(rep(0,4), js, rep(0,24)),
  c(rep(0,8), js, rep(0,20)),
  c(rep(0,12), js, rep(0,16)),
  c(rep(0,16), js, rep(0,12)),
  c(rep(0,20), js, rep(0,8)),
  c(rep(0,24), js, rep(0,4)),
  c(rep(0,28), js))
VF<-Q%*%V%*%t(Q)
# Design matrix
X<-as.matrix(cbind(rep(1,I), unique(table.7.8[,1:2])))
# Functions: the mean response in each population
Fp<-c(js%*%p1, js%*%p2, js%*%p3, js%*%p4, js%*%p5, js%*%p6, js%*%p7, js%*%p8)
# Estimate beta
b<-as.numeric(solve(t(X)%*%solve(VF)%*%X)%*%t(X)%*%solve(VF)%*%Fp)
[1]  2.61328732 -0.04155966  0.18163655
# ASCOV: asymptotic standard errors
sqrt(diag(solve(t(X)%*%solve(VF)%*%X)))
[1] 0.2311374 0.1369473 0.0681758
# W (Wald residual GOF statistic) for the entire model
t(Fp-X%*%b)%*%solve(VF)%*%(Fp-X%*%b)
         [,1]
[1,] 5.242789
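# (Added) p-value for the Wald GOF statistic W above; with 8 mean-response
# functions and 3 regression parameters, the reference chi-squared has 5 df:
1-pchisq(5.242789, df=8-3)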
### Generalized CMH Tests
# mantelhaen.test only applies to 2x2xK tables. Here the general association
# statistic is calculated using modified code from that function:
table.7.8.array<-crosstabs(freq~jobsatf+income+gender, data=table.7.8)   # crosstabs is S-PLUS; use xtabs in R
n<-m<-double(length=9)     # df = (4-1)*(4-1) = 9
V<-matrix(0, nr=9, nc=9)
for(k in 1:2){             # number of strata = 2
  f<-table.7.8.array[,,k]
  ntot<-sum(f)
  rowsums<-apply(f,1,sum)[-4]
  colsums<-apply(f,2,sum)[-4]
  n<-n + c(f[-4,-4])
  m<-m + c(outer(rowsums, colsums, "*"))/ntot
  V<-V + (kronecker(diag(ntot*rowsums, nrow=4-1) - outer(rowsums,rowsums),
                    diag(ntot*colsums, nrow=4-1) - outer(colsums,colsums))/(ntot^2*(ntot-1)))
}
n<-(n-m)
(STATISTIC<-crossprod(n, solve(V,n)))
         [,1]
[1,] 12.53139

# nonzero correlation
table.7.8.array<-crosstabs(freq~jobsatf+income+gender, data=table.7.8)
Tk<-apply(table.7.8.array, 3, function(x,u,v) sum(outer(u,v)*x), u=c(1,3,4,5), v=c(3,10,20,35))
ETk<-apply(table.7.8.array, 3, function(x,u,v) sum(rowSums(x)*u)*sum(colSums(x)*v)/sum(x), u=c(1,3,4,5), v=c(3,10,20,35))
varTk<-apply(table.7.8.array, 3, function(x,u,v){
  n<-sum(x)
  rowsums<-rowSums(x)
  colsums<-colSums(x)
  (sum(rowsums*u^2) - (sum(rowsums*u)^2)/n)*(sum(colsums*v^2) - (sum(colsums*v)^2)/n)/(n-1)
}, u=c(1,3,4,5), v=c(3,10,20,35))
(sum(Tk-ETk)^2)/sum(varTk)
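# (Added) the generalized CMH correlation statistic is referred to a
# chi-squared distribution with 1 df:
1-pchisq((sum(Tk-ETk)^2)/sum(varTk), df=1)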
############# Chap. 8 - Agresti ###################

##### Loglinear models
## three-way tables
# table 8.3
table.8.3<-data.frame(expand.grid(
  marijuana=factor(c("Yes","No"), levels=c("No","Yes")),
  cigarette=factor(c("Yes","No"), levels=c("No","Yes")),
  alcohol=factor(c("Yes","No"), levels=c("No","Yes"))),
  count=c(911,538,44,456,3,43,2,279))

# IPF
library(MASS)
fitACM<-loglm(count~alcohol*cigarette*marijuana, data=table.8.3, param=T, fit=T)   # ACM
fitAC.AM.CM<-update(fitACM, .~. - alcohol:cigarette:marijuana)                     # AC, AM, CM
fitAM.CM<-update(fitAC.AM.CM, .~. - alcohol:cigarette)                             # AM, CM
fitAC.M<-update(fitAC.AM.CM, .~. - alcohol:marijuana - cigarette:marijuana)        # AC, M
fitA.C.M<-update(fitAC.M, .~. - alcohol:cigarette)                                 # A, C, M

# fitted frequencies
data.frame(table.8.3[,-4], ACM=c(t(fitted(fitACM))), AC.AM.CM=c(t(fitted(fitAC.AM.CM))), AM.CM=c(t(fitted(fitAM.CM))), AC.M=c(t(fitted(fitAC.M))), A.C.M=c(t(fitted(fitA.C.M))))

# conditional odds ratios for model AC, AM, CM
fit.array<-fitted(fitAC.AM.CM)
apply(fit.array,1,function(x) x[1,1]*x[2,2]/(x[2,1]*x[1,2]))   # CM (given level of A)
apply(fit.array,2,function(x) x[1,1]*x[2,2]/(x[2,1]*x[1,2]))   # AM
apply(fit.array,3,function(x) x[1,1]*x[2,2]/(x[2,1]*x[1,2]))   # AC

# marginal odds ratios
sum.array<-function(array, perm=c(3,2,1)){
  res<-aperm(array, perm)
  colSums(res)
}
odds.ratio<-function(x) x[1,1]*x[2,2]/(x[2,1]*x[1,2])
odds.ratio(sum.array(fit.array))                  # AC (sum over M, the faces)
odds.ratio(sum.array(fit.array, perm=1:3))        # CM (sum over A, the rows)
odds.ratio(sum.array(fit.array, perm=c(2,1,3)))   # AM (sum over C, the columns)

# loglin
fit.loglin<-loglin(fitted(fitACM), margin=list(1:2, 2:3, c(1,3)), fit=T, param=T)

# glm
options(contrasts=c("contr.treatment","contr.poly"))
fit.glm<-glm(count~.^2, data=table.8.3, family=poisson)

## GOF tests
summary(fitAC.AM.CM)
fit.loglin
summary(fit.glm)
anova(fitAC.M, fitAC.AM.CM, fitAM.CM, fitA.C.M)

## parameter estimates and ASEs
# loglm
fitAC.AM.CM$param
X<-model.matrix(count~(alcohol+cigarette+marijuana)^2, data=table.8.3, contrasts=list(alcohol=as.matrix(c(1,0)), marijuana=as.matrix(c(1,0)), cigarette=as.matrix(c(1,0))))
sqrt(diag(solve(t(X)%*%diag(c(fitAC.AM.CM$fitted))%*%X)))
# glm
fit.glm2<-update(fit.glm, contrasts=list(alcohol=as.matrix(c(1,0)), marijuana=as.matrix(c(1,0)), cigarette=as.matrix(c(1,0))))
summary(fit.glm2, cor=F)

## higher dimensional tables
table.8.8<-data.frame(expand.grid(belt=c("No","Yes"), location=c("Urban","Rural"), gender=c("Female","Male"), injury=c("No","Yes")), count=c(7287,11587,3246,6134,10381,10969,6123,6693,996,759,973,757,812,380,1084,513))
library(MASS)
fitG.I.L.S<-loglm(count~., data=table.8.8, fit=T, param=T)                        # mutual independence
fitGI.GL.GS.IL.IS.LS<-update(fitG.I.L.S, .~.^2, data=table.8.8, fit=T, param=T)   # all pairwise associations
fitGIL.GIS.GLS.ILS<-update(fitG.I.L.S, .~.^3, data=table.8.8, fit=T, param=T)     # all three-way associations
# comparison
anova(fitG.I.L.S, fitGI.GL.GS.IL.IS.LS, fitGIL.GIS.GLS.ILS)
# try another
(fitGI.IL.IS.GLS<-update(fitGI.GL.GS.IL.IS.LS, .~. + gender:location:belt))
fitted(fitGI.IL.IS.GLS)

# conditional odds ratios for model GI, IL, IS, GLS
fit.array<-fitted(fitGI.IL.IS.GLS)
odds.ratio<-function(x) x[1,1]*x[2,2]/(x[2,1]*x[1,2])
apply(fit.array, c(1,4), odds.ratio)   # GL at each level of S (same for I = yes or no)
apply(fit.array, c(2,4), odds.ratio)   # GS at each level of L (same for I = yes or no)
apply(fit.array, c(3,4), odds.ratio)   # LS at each level of G (same for I = yes or no)
apply(fit.array, c(1,2), odds.ratio)   # GI (same for each combination of L, S)
apply(fit.array, c(1,3), odds.ratio)   # IL (same for each combination of G, S)
apply(fit.array, c(2,3), odds.ratio)   # IS (same for each combination of G, L)

# dissimilarity index
Fitted.values<-c(fit.array)
sum(abs(table.8.8$count-Fitted.values))/(2*sum(table.8.8$count))

### Loglinear models as Logit Models
options(contrasts=c("contr.treatment","contr.poly"))
# loglinear model
fit.loglinear<-glm(count~.^2 + gender:location:belt, data=table.8.8, family=poisson)
# logit model
fit.logit<-glm(injury~gender+belt+location, data=table.8.8, family=binomial, weight=count)
fit.loglinear$coefficients
fit.logit$coefficients
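# (Added check, illustrative only:) with treatment contrasts, the logit-model
# coefficients correspond to the loglinear terms that involve injury; extract
# those terms by name to compare the two fits:
coef(fit.loglinear)[grep("injury", names(coef(fit.loglinear)))]
coef(fit.logit)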
### contingency table standardization
table.8.15<-data.frame(expand.grid(
  Attitude=factor(c("Disapprove","Middle","Approve"), levels=c("Disapprove","Middle","Approve")),
  Education=factor(c("<HS","HS",">HS"), levels=c("<HS","HS",">HS"))),
  count=c(209,101,237,151,126,426,16,21,138))
fit<-glm(I(rep(100/3, 9))~Attitude+Education+offset(log(count)), family=poisson, data=table.8.15)
cbind(table.8.15, std=fitted(fit))

############# Chap. 9 - Agresti ###################

## model selection (student survey data)
table.9.1<-data.frame(expand.grid(cigarette=c("Yes","No"), alcohol=c("Yes","No"), marijuana=c("Yes","No"), sex=c("female","male"), race=c("white","other")), count=c(405,13,1,1,268,218,17,117,453,28,1,1,228,201,17,133,23,2,0,0,23,19,1,12,30,1,1,0,19,18,8,17))
library(MASS)
fit.GR<-glm(count~ . + sex*race, data=table.9.1, family=poisson)   # mutual independence + GR
fit.homog.assoc<-glm(count~ .^2, data=table.9.1, family=poisson)   # homogeneous association
summary(res<-stepAIC(fit.homog.assoc, scope=list(lower = ~ cigarette + alcohol + marijuana + sex*race), direction="backward"))
fit.AC.AM.CM.AG.AR.GM.GR.MR<-res
fit.AC.AM.CM.AG.AR.GM.GR<-update(fit.AC.AM.CM.AG.AR.GM.GR.MR, ~. - marijuana:race)

## residuals (standardized pearson)
res.model6<-resid(fit.AC.AM.CM.AG.AR.GM.GR, type="pearson")/sqrt(1-lm.influence(fit.AC.AM.CM.AG.AR.GM.GR)$hat)
fit.model6<-fitted(fit.AC.AM.CM.AG.AR.GM.GR)
data.frame(table.9.1, fitted=fit.model6, residuals=res.model6)
# the same quantities for the larger model that retains marijuana:race
resid(fit.AC.AM.CM.AG.AR.GM.GR.MR, type="pearson")/sqrt(1-lm.influence(fit.AC.AM.CM.AG.AR.GM.GR.MR)$hat)
fitted(fit.AC.AM.CM.AG.AR.GM.GR.MR)

### Modeling ordinal associations
# table 9.3
table.9.3<-read.table("c:/program files/r/rw1070/cda/sex.txt", col.names=c("premar","birth","u","v","count"))
table.9.3$birth<-5-table.9.3$birth   # rearrange scores so that the table starts at 1,1 in the upper left corner
# create uniform scores
table.9.3$u1<-table.9.3$premar
table.9.3$v1<-table.9.3$birth
# need factors for premar and birth; code the levels so that the 4th level is set to zero
table.9.3$premar<-factor(table.9.3$premar, levels=4:1)
table.9.3$birth<-factor(table.9.3$birth, levels=4:1)
# Fit the Uniform Association model, using scores for the variables only in
# the interaction. Note the use of ':'
options(contrasts=c("contr.treatment","contr.poly"))
fit.ua<-glm(count ~ premar + birth + u1:v1, data=table.9.3, family=poisson(link="log"))
# local association parameter
exp(coef(fit.ua)["u1:v1"])
# Test of the association parameter:
summary(fit.ua)$coef["u1:v1",]
exp(summary(fit.ua)$coef["u1:v1",1]+1.96*c(-1,1)*summary(fit.ua)$coef["u1:v1",2])
# nonlocal association parameter
exp(coef(fit.ua)["u1:v1"]*(4-1)*(4-1))
# fitted values
matrix(fitted(fit.ua), byrow=T, ncol=4, dimnames=list(PreMar=c("Always wrong","Almost always wrong","Wrong sometimes","Not wrong"), "Teen Birth"=c("SA","A","D","SD")))
# Independence model
fit.ind<-glm(count ~ premar + birth, data=table.9.3, family=poisson(link="log"))
# Compare the two models:
anova(fit.ind, fit.ua, test="Chi")

## Row effects model (same effect for the entire row)
# Table 9.5
table.9.5<-data.frame(expand.grid(
  Affil=factor(c("Democrat","Independent","Republican"), levels=c("Republican","Independent","Democrat")),
  Ideology=factor(c("Liberal","Moderate","Conservative"), levels=c("Conservative","Moderate","Liberal"))),
  c.Ideo=rep(1:3, each=3),
  count=c(143,119,15,156,210,72,100,141,127))
# Row Effects model:
options(contrasts=c("contr.treatment","contr.poly"))
(fit.RE<-glm(count~Ideology+Affil*c.Ideo, family=poisson, data=table.9.5))
# predicted logits
res<-matrix(fit.RE$linear.predictors, byrow=F, ncol=3)
mat<-matrix(c(res[,2]-res[,1], res[,3]-res[,2]), byrow=T, ncol=3)
# fitted values
matrix(fitted(fit.RE), byrow=F, ncol=3, dimnames=list(Affiliation=c("Democrat","Independent","Republican"), Ideology=c("Liberal","Moderate","Conservative")))
# Comparing independence and Row effects model:
fit.ind<-glm(count~Affil+Ideology, family=poisson, data=table.9.5)
anova(fit.ind, fit.RE)

## Multi-way tables
# Table 9.7
table.9.7<-data.frame(expand.grid(
  Status=factor(c("Never","Former","Current"), levels=c("Current","Former","Never")),
  Breath=factor(c("Normal","Borderline","Abnormal"), levels=c("Abnormal","Borderline","Normal")),
  Age=factor(c("< 40","40-50"), levels=c("40-50","< 40"))),
  c.Status=rep(1:3, 6),
  c.Breath=rep(rep(1:3, each=3), 2),
  count=c(577,192,682,27,20,46,7,3,11,164,145,245,4,15,47,0,7,27))
options(contrasts=c("contr.treatment","contr.poly"))
fit.hetero<-glm(count~Age*Breath + Status*Age + Age:c.Breath:c.Status, data=table.9.7, family=poisson)
summary(fit.hetero, cor=F)
(fit.homo<-glm(count~Age*Breath + Status*Age + c.Breath:c.Status, data=table.9.7, family=poisson))

# ordered strata (Z)
table.9.8<-data.frame(expand.grid(
  Age=factor(c("20-24","25-29","30-34","35-39","40-44","45-49","50-54","55-59","60-64"), levels=c("20-24","25-29","30-34","35-39","40-44","45-49","50-54","55-59","60-64")),
  Wheeze=factor(c("Yes","No"), levels=c("Yes","No")),
  Breath=factor(c("Yes","No"), levels=c("Yes","No"))),
  c.Breath=rep(1:2, each=18),
  c.Wheeze=rep(rep(1:2, each=9), 2),
  extra.term=c(1:9, rep(0,27)),
  count=c(9,23,54,121,169,269,404,406,372,7,9,19,48,54,88,117,152,106,95,105,177,257,273,324,245,225,132,1841,1654,1863,2357,1778,1712,1324,967,526))
fit.ord.strata<-glm(count~Age*Breath + Wheeze*Age + c.Breath:c.Wheeze + extra.term, data=table.9.8, family=poisson)
summary(fit.ord.strata, cor=F)
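# (Added sketch:) a likelihood-ratio test of the age-varying association,
# comparing against the nested fit without extra.term
anova(update(fit.ord.strata, .~. - extra.term), fit.ord.strata, test="Chi")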
### RC Models (iterative estimation: see R script for use of function grc)
# start with initial column scores (c.MH)
table.9.9<-data.frame(expand.grid(
  MH=factor(c("well","mild","moderate","impaired"), levels=rev(c("well","mild","moderate","impaired"))),
  SES=factor(LETTERS[1:6], levels=rev(LETTERS[1:6]))),
  c.MH=rep(4:1,6),
  count=c(64,94,58,46,57,94,54,40,57,105,65,60,71,141,77,94,36,97,54,78,21,71,54,71))
# fit R model
(fit.RE2<-glm(count~MH+SES*c.MH, family=poisson, data=table.9.9))
# get the estimated row scores (expanded to one score per observation;
# MH varies fastest) and fit C model
row.scores<-rep(rev(c(0, coef(fit.RE2)[c("SESE:c.MH","SESD:c.MH","SESC:c.MH","SESB:c.MH","SESA:c.MH")])), each=4)
(fit.CE2<-glm(count~SES+MH*row.scores, family=poisson, data=table.9.9))
# get the estimated column scores and fit R model
col.scores<-rep(rev(c(0, coef(fit.CE2)[c("MHmoderate:row.scores","MHmild:row.scores","MHwell:row.scores")])), 6)
(fit.RE2<-glm(count~MH+SES*col.scores, family=poisson, data=table.9.9))
# etc..... until convergence

## Correlation models
# do not reverse the levels of factors
table.9.9b<-data.frame(expand.grid(
  MH=factor(c("well","mild","moderate","impaired"), levels=c("well","mild","moderate","impaired")),
  SES=factor(LETTERS[1:6], levels=LETTERS[1:6])),
  count=c(64,94,58,46,57,94,54,40,57,105,65,60,71,141,77,94,36,97,54,78,21,71,54,71))
corresp(x=design.table(table.9.9b), nf=1)

### Correspondence Analysis ###
# multiv library
library(multiv)
table.9.9b.array<-t(design.table(table.9.9b))   # transposed the default
fit.ca<-ca(table.9.9b.array, nf=3)
# plot of first and second factors
plot(fit.ca$rproj[,1], fit.ca$rproj[,2], type="n", ylim=c(-.1,.1), xlim=c(-.3,.3), xlab="", ylab="", axes=F)
text(fit.ca$rproj[,1], fit.ca$rproj[,2], labels=dimnames(table.9.9b.array)$SES)
text(fit.ca$cproj[,1], fit.ca$cproj[,2], labels=dimnames(table.9.9b.array)$MH)
# Place additional axes through x=0 and y=0 (define the helper before use):
my.plaxes<-function(a, b, size=0.1){
  arrows(min(a), 0, max(a), 0, size=size)
  arrows(0, min(b), 0, max(b), size=size)
}
my.plaxes(fit.ca$rproj[,1], fit.ca$rproj[,2], size=.15)

# corresp from MASS
fit.corresp<-corresp(x=design.table(table.9.9b), nf=3)
fit.corresp$cor
fit.corresp$rscore%*%diag(fit.corresp$cor)
fit.corresp$cscore%*%diag(fit.corresp$cor)
plot(fit.corresp)

### Analyzing Rates ###
table.9.11<-data.frame(expand.grid(
  factor(c("Aortic","Mitral"), levels=c("Aortic","Mitral")),
  factor(c("<55","55+"), levels=c("<55","55+"))),
  Deaths=c(4,1,7,9), Exposure=c(1259,2082,1417,1647))
names(table.9.11)[1:2]<-c("Valve","Age")
attach(table.9.11)
table.9.11<-data.frame(table.9.11, Risk=Deaths/Exposure)
options(contrasts=c("contr.treatment","contr.poly"))
fit.rate<-glm(Deaths~Valve+Age+offset(log(Exposure)), family=poisson, data=table.9.11)
summary(fit.rate, cor=F)
library(MASS)
exp(confint(fit.rate))
# Pearson chi-squared statistic, use resid()
sum(resid(fit.rate, type="pearson")^2)

# Table 9.12
attach(table.9.11)
mhat<-fitted(fit.rate)
exphat<-fitted(fit.rate)/Exposure
temp<-rbind(mhat, exphat)
array(temp, dim=c(2,2,2), dimnames=list(c("Deaths","Risk"), Valve=c("Aortic","Mitral"), Age=c("<55","55+")))
# identity link
fit.id<-glm(Deaths~I(codes(Valve)*Exposure)+I(rev(codes(Age))*Exposure)+Exposure-1, family=poisson(link=identity), data=table.9.11)   # codes() is S-PLUS
summary(fit.id, cor=F)
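# (Added) GOF p-value for the main-effects rate model fit.rate above:
# 4 cells minus 3 parameters leaves 1 df
1-pchisq(sum(resid(fit.rate, type="pearson")^2), df=1)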
### Modelling survival times ###
# Table 9.13
# form the data set
table.9.13<-expand.grid(Stage=factor(c(1,2,3), levels=3:1), Histology=factor(c("I","II","III"), levels=rev(c("I","II","III"))), Time=factor(c(0,2,4,6,8,10,12), levels=rev(c(0,2,4,6,8,10,12))))
table.9.13<-data.frame(table.9.13,
  Deaths=c(9,12,42,5,4,28,1,1,19,2,7,26,2,3,19,1,1,11,9,5,12,3,5,10,1,3,7,10,10,10,2,4,5,1,1,6,1,4,5,2,2,0,0,0,3,3,3,4,2,1,3,1,0,3,1,4,1,2,4,2,0,2,3),
  Exposure=c(157,134,212,77,71,130,21,22,101,139,110,136,68,63,72,17,18,63,126,96,90,63,58,42,14,14,43,102,86,64,55,42,21,12,10,32,88,66,47,50,35,14,10,8,21,82,59,39,45,32,13,8,8,14,76,51,29,42,28,7,6,6,10))
# loglinear analysis using glm()
options(contrasts=c("contr.treatment","contr.poly"))
fit.surv<-glm(Deaths~Histology+Time+Stage+offset(log(Exposure)), data=table.9.13, family=poisson)
summary(fit.surv, cor=F)
# choose a model using stepAIC from the MASS library
fit2<-glm(Deaths~Time+offset(log(Exposure)), data=table.9.13, family=poisson)
library(MASS)
stepAIC(fit.surv, scope=list(lower=formula(fit2), upper=~.^2), direction="both", trace=T)$anova

### Sparse tables ###
table.9.16<-data.frame(expand.grid(treatment=c(1,0), Center=factor(1:5, levels=5:1)), prop=c(0,0,1/13,0,0,0,6/9,2/8,5/14,2/14))
options(contrasts=c("contr.treatment","contr.poly"))
fit.glm.sparse<-glm(prop~treatment+Center-1, data=table.9.16, weights=c(5,9,13,10,7,5,9,8,14,14), family=binomial)
summary(fit.glm.sparse)
fit.glm.sparse2<-glm(prop~Center-1, data=table.9.16, weights=c(5,9,13,10,7,5,9,8,14,14), family=binomial)
anova(fit.glm.sparse2, fit.glm.sparse)
# exact conditional test
temp<-numeric(20)   # interleave successes and failures per treatment-by-center cell
temp[rep(c(T,F),10)]<-c(0,0,1,0,0,0,6,2,5,2)
temp[rep(c(F,T),10)]<-c(5,9,12,10,7,5,3,6,9,12)
table.9.16a<-data.frame(table.9.16[rep(1:10, c(5,9,13,10,7,5,9,8,14,14)), 1:2], case=rep(rep(c(1,0),10), temp))
coxph(Surv(rep(1,sum(temp)), case)~treatment+strata(Center), data=table.9.16a, method="exact")

############# Chap. 10 - Agresti ###################

## Comparing dependent proportions
table.10.1<-matrix(c(794,150,86,570), byrow=T, ncol=2)
mcnemar.test(table.10.1, correct=F)
# margin.table and prop.table (base R functions), included here for S-PLUS:
margin.table<-function(x, margin=NULL){
  if(!is.array(x)) stop("x is not an array")
  if(length(margin)){
    z<-apply(x, margin, sum)
    dim(z)<-dim(x)[margin]
    dimnames(z)<-dimnames(x)[margin]
  }
  else return(sum(x))
  class(z)<-oldClass(x)
  z
}
prop.table<-function(x, margin=NULL){
  if(length(margin)) sweep(x, margin, margin.table(x, margin), "/")
  else x/sum(x)
}
table.10.1.prop<-prop.table(table.10.1)
prop.diff<-margin.table(table.10.1.prop,2)[1]-margin.table(table.10.1.prop,1)[1]
off.diag<-diag(table.10.1.prop[1:2,2:1])
# Wald CI for the difference of dependent proportions
prop.diff + c(-1,1)*qnorm(.975)*sqrt((sum(off.diag) - diff(off.diag)^2)/sum(table.10.1))

## CLR matched case-control
table.10.3<-data.frame(pair=rep(1:144, rep(2,144)), MI=rep(c(0,1),144), diabetes=c(rep(1,2*9), rep(c(1,0),16), rep(c(0,1),37), rep(0,2*82)))
fit.CLR<-coxph(Surv(rep(1,2*144), MI)~diabetes+strata(pair), method="exact", data=table.10.3)
summary(fit.CLR)

## GLMM
library(MASS)
fit.glmmPQL<-glmmPQL(MI ~ diabetes, random = ~ 1 | pair, family=binomial, data=table.10.3)
summary(fit.glmmPQL)

### Marginal models for square tables
## ordinal models (cumulative logit): see R script
### nominal models (baseline-category logit)
# ML fit (marginal homogeneity is imposed through the design matrix below)
dummies<-matrix(c(
  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
  0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
  0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 1, -1, 0, 0, -1, 0, 0,
  -1, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, -1,
  0, 0, -1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow=16, ncol=((4-1)^2) + 1 + 3)
dummies<-data.frame(counts=c(11607,100,366,124,87,13677,515,302,172,225,17819,270,63,176,286,10192), dummies)
names(dummies)<-c("counts","m11","m12","m13","m21","m22","m23","m31","m32","m33","m44","m1","m2","m3")
fit<-glm(counts~.-1, family=poisson(identity), data=dummies)
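# (Added) LR goodness-of-fit test of marginal homogeneity:
# 16 cells minus 13 parameters leaves 3 df
1-pchisq(fit$deviance, df=16-13)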
residence80<-(c("NE","MW","S","W")) residence85<-(c("NE","MW","S","W")) matrix(fitted(fit),byrow=T,nc=4,dimnames=list(residence80,residence85)) rowSums(matrix(fitted(fit),byrow=T,nc=4)) colSums(matrix(fitted(fit),byrow=T,nc=4)) # Bhapkar's method (Wald test) A<-matrix(c(0,1,1,1,-1,0,0,0,-1,0,0,0,-1,0,0,0, 0,1,0,0,-1,0,-1,-1,0,1,0,0,0,1,0,0, 0,0,1,0,0,0,1,0,-1,-1,0,-1,0,0,1,0),nc=16,nr=3,byrow=T) counts<-c(11607,100,366,124,87,13677,515,302,172,225,17819,270,63,176,286,10192) p<-counts/(n<-sum(counts)) y<-A%*%p Sp<--outer(p,p)/n diag(Sp)<-p*(1-p)/n Sy<-A%*%Sp%*%t(A) 1-pchisq(as.numeric(t(y)%*%solve(Sy)%*%y),df=3) ## Symmetry models residence80<-factor(residence80, levels=residence80) residence85<-residence80 table.10.6<-expand.grid(res80=residence80,res85=residence85) table.10.6$counts<-c(11607,100,366,124,87,13677,515,302,172,225,17819,270,63,176,286,10192) table.10.6$symm<-paste( pmin(as.numeric(table.10.6$res80),as.numeric(table.10.6$res85)), pmax(as.numeric(table.10.6$res80),as.numeric(table.10.6$res85)),sep=",") table.10.6$symm<-factor(table.10.6$symm, levels=rev(table.10.6$symm)) options(contrasts=c("contr.treatment", "contr.poly")) (fit.symm<-glm(counts~symm,family=poisson(log),data=table.10.6)) ## Quasi-Symmetry Model options(contrasts = c("contr.treatment", "contr.poly")) table.10.6$res80a<-factor(table.10.6$res80, levels=rev(residence80)) table.10.6$res85a<-factor(table.10.6$res85, levels=rev(residence80)) (fit.qsymm<-glm(counts~symm+res80a,family=poisson(log),data=table.10.6)) 1-pchisq(fit.qsymm$deviance, df=fit.qsymm$df) 1-pchisq(fit.symm$deviance-fit.qsymm$deviance, df=fit.symm$df-fit.qsymm$df) # fitted values for quasi-symmetry model matrix(fitted(fit.qsymm),ncol=4,byrow=T,dimnames=list(rev(levels(residence80)),rev(levels(residence85)))) ## Quasi-independence model table.10.6$D1<-as.numeric(table.10.6$symm=="1,1") table.10.6$D2<-as.numeric(table.10.6$symm=="2,2") table.10.6$D3<-as.numeric(table.10.6$symm=="3,3") table.10.6$D4<-as.numeric(table.10.6$symm=="4,4") options(contrasts = c("contr.treatment", "contr.poly")) (fit.qi<-glm(counts~res80a+res85a+D1+D2+D3+D4,family=poisson(log),data=table.10.6)) ## Square Tables with Ordered Categories table.10.5<-data.frame(expand.grid(PreSex=factor(1:4),ExSex=factor(1:4)),counts=c(144, 33, 84, 126, 2, 4 ,14 ,29 ,0 ,2 ,6 ,25, 0, 0, 1, 5)) # quasi-symmetry (ordinal) table.10.5$symm<-paste( pmin(as.numeric(table.10.5$PreSex),as.numeric(table.10.5$ExSex)), pmax(as.numeric(table.10.5$PreSex),as.numeric(table.10.5$ExSex)),sep=",") table.10.5$symm<-factor(table.10.5$symm, levels=rev(table.10.5$symm)) table.10.5$scores<-rep(1:4,each=4) options(contrasts = c("contr.treatment", "contr.poly")) fit.oqsymm<-glm(counts~symm + scores,data=table.10.5,family=poisson(log)) # Conditional symmetry model # Get tau temp<-matrix(0,nr=4,nc=4) tau<-as.numeric(row(temp)