# Multinomial Logit Analysis in SAS (Data simulation in Splus) # References: Agresti, 1990, Categorical Data Analysis, p.313 # http://wizard.ucr.edu/~rhannema/soc271/mlogit.html ######################## ### Define the model ### ######################## # Outcomes: Y in (1,2,3) # Covariates: categorical w in (1,2,3) continuous x in [0,1] # Parameters directly relate to log-odds of outcome k to last outcome b0_c(-1.0, -1.5) # intercepts for Y=1,2 respectively bw_matrix(c(0.7, 0.5, NA, # w coefficients for Y=1, w=(1,2,3) 0.9, -0.4, NA), # w coefficients for Y=2, w=(1,2,3) 2, 3, byrow=T) bw[,3]_-(bw[,1]+bw[,2]) # SAS parametrization: zero constraint bx_c(-0.4, 0.6) # x slopes for Y=1,2 respectively # for y=1 and for y=2: # LogOdds((Y=y/Y=3)|w,x)=b0[y]+ # (w==1)*bw[y,w] + (w==2)*bw[y,w] + (w==3)*bw[y,w]+ # bx[y]*x) # # Pr(Y=y|w,x)=Odds((Y=y/Y=3)|w,x) / # (Odds((Y=1/Y=3)|w,x)+Odds((Y=2/Y=3)|w,x)+1) ########################################################### ### Simulate data at W=1,2,3 X=0.0,0.2,0.4,0.6,0.8,1.0 ### ########################################################### n_500 # approximate count for each w/x combination # data has three lines for each w/x combination (1 for each value of y) # data format is x w y cnt (p) where (p) is added as a simulation check data_cbind(w=rep(1:3,rep(18,3)), x=rep(rep(seq(0,1,0.2),rep(3,6)),3), y=rep(1:3,18), cnt=rep(NA,54), p=rep(NA,54)) for (i in seq(1,54,3)) { gw_data[i,1] gx_data[i,2] n1_exp(b0[1]+(gw==1)*bw[1,gw]+(gw==2)*bw[1,gw]+(gw==3)*bw[1,gw]+bx[1]*gx) n2_exp(b0[2]+(gw==1)*bw[2,gw]+(gw==2)*bw[2,gw]+(gw==3)*bw[2,gw]+bx[2]*gx) den_n1+n2+1 p1_n1/(n1+n2+1) p2_n2/(n1+n2+1) m_rpois(1,n) data[i,5]_round(p1,2) data[i,4]_rbinom(1,m,p1) data[i+1,5]_round(p2,2) data[i+1,4]_rbinom(1,m-data[i,4],p2/(1-p1)) data[i+2,5]_round(1-p1-p2,2) data[i+2,4]_m-data[i,4]-data[i+1,4] } write.table(data,"MNLogit.dat",sep=" ",dimnames.write=F) ##################################################################### ### Analyze in SAS using MNLogit.sas (see results in MNLogit.lst) ### ##################################################################### ####################################################### ### Plot probability profiles: "true" and estimated ### ####################################################### # >> Here is where it is shown how to tell which parameter is which # >> and how to calculate log-odds and probabilities. # >> For multilevel categorical covariates with K levels of outcome and # >> I levels of covariate, we see I-1 groups with K-1 parameters in each group. # >> For each group, the missing Kth parameter is 0 (and exp(0)=1 hence # >> the "1" in the denominators below). We do need to calculate the # >> missing Ith group: for each outcome, the missing Ith group parameter # >> is minus the sum of the parameters for that outcome in all of the # >> I-1 other groups. # Manually enter SAS parameter estimates Sb0_c(-1.0408, -1.5601) Sbw_matrix(c(0.7359, 0.4969, NA, # parameters 3 and 5 0.9069, -0.4447, NA), # parameters 4 and 6 2, 3, byrow=T) Sbw[,3]_-(Sbw[,1]+Sbw[,2]) Sbx_c(-0.4389, 0.7146) xseq_seq(0,1,len=100) par(mfrow=c(1,3), oma=c(2,1,3,1)) for (W in 1:3) { n1_exp(b0[1]+(W==1)*bw[1,W]+(W==2)*bw[1,W]+(W==3)*bw[1,W]+bx[1]*xseq) Sn1_exp(Sb0[1]+(W==1)*Sbw[1,W]+(W==2)*Sbw[1,W]+(W==3)*Sbw[1,W]+Sbx[1]*xseq) n2_exp(b0[2]+(W==1)*bw[2,W]+(W==2)*bw[2,W]+(W==3)*bw[2,W]+bx[2]*xseq) Sn2_exp(Sb0[2]+(W==1)*Sbw[2,W]+(W==2)*Sbw[2,W]+(W==3)*Sbw[2,W]+Sbx[2]*xseq) den_n1+n2+1 Sden_Sn1+Sn2+1 p1_n1/den Sp1_Sn1/Sden p2_n2/den Sp2_Sn2/Sden p3_1-p1-p2 Sp3_1-Sp1-Sp2 plot(c(0,1),c(0,1),type="n", xlab="covariate x",ylab="probability",main=paste("covariate w=",W)) lines(xseq,p1) lines(xseq,p2) lines(xseq,p3) text(c(0,1),p1[c(1,100)],"Y=1") text(c(0,1),p2[c(1,100)],"Y=2") text(c(0,1),p3[c(1,100)],"Y=3") lines(xseq,Sp1,lty=2) lines(xseq,Sp2,lty=2) lines(xseq,Sp3,lty=2) } mtext("Multinomial Logit Model", outer=T, cex=1.25) mtext("Solid=\"true\", dotted=estimate", outer=2, side=1) dev.copy(postscript, file="MNLogit.ps", horiz=F) dev.off() ############################# ### Observed and Expected ### ############################# # SAS "population profile" shows that sample order matches Splus "data" variable ob_c(159,123, 164,138, 138,140, 100,177, 112,176, 94,209, 171,41, 168, 56, 138, 57, 130, 48, 144, 78, 106, 84, 42, 61, 37, 48, 38, 68, 32, 74, 28,102, 25,108) ex_c(162.3,114.6, 157.0,139.6, 135.7,151.9, 111.6,157.4, 106.1,188.4, 94.4,211.1, 170.9,39.6, 162.6,47.5, 145.0,53.4, 128.7,59.7, 137.7,80.4, 112.1,82.4, 43.8,56.4, 41.4,67.1, 33.7,68.8, 30.8,79.1, 27.6,89.1, 24.7,100.5) st_1 # Y=1 matrix(round((ob[seq(st,36,2)]-ex[seq(st,36,2)])/sqrt(ex[seq(st,36,2)]),2),6) # [,1] [,2] [,3] # [1,] -0.26 0.01 -0.27 # [2,] 0.56 0.42 -0.68 # [3,] 0.20 -0.58 0.74 # [4,] -1.10 0.11 0.22 # [5,] 0.57 0.54 0.08 # [6,] -0.04 -0.58 0.06 st_2 $ Y=2 matrix(round((ob[seq(st,36,2)]-ex[seq(st,36,2)])/sqrt(ex[seq(st,36,2)]),2),6) # [,1] [,2] [,3] # [1,] 0.78 0.22 0.61 # [2,] -0.14 1.23 -2.33 # [3,] -0.97 0.49 -0.10 # [4,] 1.56 -1.51 -0.57 # [5,] -0.90 -0.27 1.37 # [6,] -0.14 0.18 0.75 # Biggest residual is -2.33 for Y=2|X=0.2,W=3 # w x Y N Pr # 3 0.2 1 37 0.08 # 3 0.2 2 48 0.12 # 3 0.2 3 463 0.80 (463+48+37)*0.08 # 43.84 (463+48+37)*0.12 # 65.76 (463+48+37)*0.80 # 438.4 # But there are not really large residuals or patterns to the residuals # so the model appears to fit well.