# R code used in solutions to exam 2 # Load the data, check that it has the right properties x = read.csv("data.csv", row.names=1) dim(x) colnames(x) ### Problem 1a # Create Q-Q plots for each observable par(mfrow = c(4,3), mar = c(2,2,2,2)) for(k in 1:ncol(x)) { qqnorm(x[,k], xlab = "", ylab = "", main = colnames(x)[k]) qqline(x[,k], col = 2) } ### Problem 2a # Load np library, fit non-parametric joint density; do not let np go crazy in # optimizing the bandwidth require(np) np.dens = npudens(~X.1+X.10, data = x, tol=0.05,ftol=0.05) # Let np create a 50x50 plotting grid; extract it and the values of the density # along that grid fhat = plot(np.dens, plot.behavior="data") np.plot = fhat$d1 np.grid = cbind(np.plot$eval$Var1, np.plot$eval$Var2) # Pretty contour plot require(lattice) contourplot(np.plot$dens~np.grid[,1]*np.grid[,2],cuts=20, xlab="X.1", ylab="X.10", labels=list(cex=0.5), main = "Non parametric joint density estimate") ### Problem 2b # Find MLEs for the mean vector and covariance matrix of a Gaussian here; # these are just the sample values means = colMeans(x[,c(1,10)]) cov.mat = cov(x[,c(1,10)]) # Plot the Gaussian density on the same grid of points require(mvtnorm) fitted.gauss = dmvnorm(np.grid,means,cov.mat) contourplot(fitted.gauss~np.grid[,1]*np.grid[,2],cuts=20, xlab="X.1", ylab="X.10", labels=list(cex=0.6), main = "Fitted Gaussian joint density estimate") ### Problem 2c # Plot the difference between the two density estimates on this grid diff.np.gauss = fitted.gauss - np.plot$dens contourplot(diff.np.gauss~np.grid[,1]*np.grid[,2],cuts=20, xlab="X.1", ylab="X.10", labels=list(cex=0.6), main = "Gaussian fit minus non parametric fit") ### Problem 2e (extra credit) # Bootstrap the log-likelihood ratio # Calculate difference in log-likelihoods on a common data set: diff.log.like <- function(data) { my.np <- npudens(~X.1+X.10,data=data,tol=0.1,ftol=0.1) my.means <- colMeans(data) my.cov <- cov(data) np.ll <- sum(log(predict(my.np))) gauss.ll <- sum(log(dmvnorm(data,my.means,my.cov))) return(np.ll - gauss.ll) } # Simulate data from the fitted Gaussian: p2sim <- function() { sim <- rmvnorm(nrow(x),means,cov.mat) sim <- data.frame(X.1=sim[,1],X.10=sim[,2]) return(sim) } # Do the simulation 100 times test.stats <- replicate(100,diff.log.like(p2sim())) ### Problem 3 # Do the PCA x.pca = prcomp(x) ### Problem 3a plot(x.pca$rotation[,1],type="l",col=1, ylim = c(-1, 1), lwd = 2, xlab = "Column index", ylab = "Principal component projection", main = "Projection of components onto the 10 variables") lines(x.pca$rotation[,2],type="l",col=2, lwd = 2) lines(x.pca$rotation[,3],type="l",col=3, lwd = 2, lty = 2) lines(x.pca$rotation[,4],type="l",col=4, lwd = 2, lty = 3) lines(x.pca$rotation[,5],type="l",col=6, lwd = 2, lty = 4) legend(1, 1.1, c("Component 1", "Component 2", "Component 3", "Component 4", "Component 5"), col = c(1,2,3,4,6), text.col = c(1,2,3,4,6), lty = c(1,1,2,3,4), bty = "n") ### Problem 3b cumulative.pct = cumsum(x.pca$sdev^2)/sum(x.pca$sdev^2) plot(1:10, cumulative.pct, type = "b", xlab = "Number of components", ylab = "Fraction of variance retained by principal components") ### Problem 4a x.fa = factanal(x,factors=1) plot(x.pca$rotation[,1],type="l",col=2, ylim = c(0, 0.8), lwd = 2, xlab = "Column index", ylab = "Projection") lines(x.fa$loadings,type="l",lwd = 2) legend(1, 0.7, c("FA first factor", "PCA first component"), col = c(1,2), text.col = c(1,2), lty = c(1,1), bty = "n") ### Problem 4b x.fa2 = factanal(x,factors=2) plot(x.fa2$loadings[,1],type="l", ylim = c(0, 0.8), lwd = 2, xlab = "Column index", ylab = "Projection") lines(x.fa2$loadings[,2],type="l",lwd = 2, col=2) lines(x.fa$loadings,type="l",lwd = 1, col=3, lty = 2) legend(1, 0.7, c("First factor", "Second factor", "Former first factor"), col = c(1,2,3), text.col = c(1,2,3), lty = c(1,1,2), bty = "n") ### Problem 4c # What fraction of variance belongs to the factors? var.fraction <- function(fa) { psi <- fa$uniquenesses p <- length(psi) return(1-sum(psi)/p) } # Calculate that from scratch var.fraction.with.q.factors <- function(q,data=x) { return(var.fraction(factanal(data,factors=q))) } # for 1--5 factors cumulative.pct.factors <- sapply(1:5,var.fraction.with.q.factors) # and plot plot(1:5, cumulative.pct.factors, type = "l", ylim=c(0,1), xlab = "Number of factors or components", ylab = "Fraction of variance retained") lines(1:5, cumulative.pct[1:5], lty = 2) legend(1,0.8,legend=c("Factors","Principal Components"),lty=c(1,2),bty="n") ### Problem 5b # Calculate the covariance matrix implied by a factor model factor.cov <- function(fa) { w <- t(fa$loadings) Psi <- diag(fa$uniquenesses) return( t(w) %*% w + Psi) } # Probability density function for a factor model dfactanal <- function(x,fa) { require(mvtnorm) p <- ncol(x) return(dmvnorm(scale(x),mean=rep(0,p),sigma=factor.cov(fa))) } # Log-likelihood of data under factor model loglik.factanal <- function(x,fa) { return(sum(log(dfactanal(x,fa)))) } # Calculate log-likelihoods, test statistics, degrees of freedom logliks <- vector(length=5) num.params <- vector(length=5) p=10 for (q in 1:5) { logliks[q] <- loglik.factanal(x,factanal(x,factors=q)) num.params[q] <- p*q-q*(q-1)/2 } test.stats <- 2*diff(logliks) dfs <- diff(num.params) # p-values pchisq(test.stats,df=dfs,lower.tail=FALSE) ### Problem 6b # Fit two-component multivariate Gaussian mixture by EM # Inputs: data array (x) # maximum number of EM steps tried (t) # tolerance level: iteration stops when difference in posterior probability is # below this; adjusted to number of data points (eps) # Presumes: mixtools library is available for dmvnorm() # Output: See above myfunction <- function(x, t=100, eps=1e-2/sqrt(nrow(x))) { # Make surer mixtools loaded an x is an array; if either condition # is not met, quit and explain what's wrong: stopifnot(require(mixtools),is.array(x)) # Record the number of observations: n <- nrow(x) # Make w a vector of alternating 0's and 1's, one per observation w <- rep(c(0,1),length.out=n) # Re-arrange w randomly: w <- sample(w) # Set del so it is bigger than eps for any finite eps: del <- Inf # Initialize a counter: i <- 0 # Do EM as long as it has not converged to within eps, for up to t iterations while ((del > eps) && (i < t)) { # Keep track of how many iterations you've done: i <- i+1 # Get the mean of absolute values in w, the total weight of component 1 l1 <- mean(abs(w)) # The mean will always be in [0,1]; compute its complement: l2 <- 1 - l1 # Find the weighted mean of each column of x with weights w: m1 <- apply(x,2,weighted.mean,w=w) # Find the weighted mean of each column of x with complementary # weights 1-w: m2 <- apply(x,2,weighted.mean,w=(1-w)) # Compute the covariance matrix for x with rows weighted by w s1 <- cov.wt(x,wt=w)$cov # Compute the covariance matrix for x with row weights 1-w s2 <- cov.wt(x,wt=(1-w))$cov # For each row of x find likelihood of generating that row from # a Gaussian distribution with means m1 and covariances s1 p1 <- dmvnorm(x,m1,s1) # Compute the same likelihoods with m2, s2 p2 <- dmvnorm(x,m2,s2) # Make new weights. For each observation, the new weight is the likelihood # of the data coming from the first Gaussian over the total likelihood of # coming from either of the two proposed Gaussians, with likelihoods # weighted by the mixing weights of the components --- Bayes's rule wn <- l1*p1/(l1*p1+l2*p2) # Find the maximum difference between old and new weights; if this is # smaller than eps, we're done del <- max(abs(wn-w)) # Replace the old weights with the new weights w <- wn } # Return everything in a list return(list(m1=m1,m2=m2,s1=s1,s2=s2,l1=l1,l2=l2,w=w,t=i,del=del)) } ### Problem 7a library(mixtools) mix2 <- mvnormalmixEM(x,k=2,maxit=100,epsilon=1e-1) mix3 <- mvnormalmixEM(x,k=3,maxit=100,epsilon=1e-1) mix4 <- mvnormalmixEM(x,k=4,maxit=100,epsilon=1e-1) mix5 <- mvnormalmixEM(x,k=5,maxit=100,epsilon=1e-1) mix6 <- mvnormalmixEM(x,k=6,maxit=100,epsilon=1e-1) logliks.mixes <- c(mix2$loglik,mix3$loglik,mix4$loglik,mix5$loglik,mix6$loglik) loglik.mvn <- sum(log(dmvnorm(as.matrix(x),colMeans(x),cov(x)))) logliks.mixes <- c(loglik.mvn,logliks.mixes) plot(logliks.mixes,type="b",xlab="Number of mixture components", ylab="In-sample log likelihood") ### Problem 7c source("http://www.stat.cmu.edu/~cshalizi/402/lectures/20-mixture-examples/bootcomp.R") bc <- boot.comp(x,max.comp=6,mix.type="mvnormalmix",maxit=100,epsilon=1e-1,B=50) ### Problem 7d cv.factors.vs.mixtures <- function(n.folds=5,q.factors,k.mixes,x,...) { x <- scale(x) n <- nrow(x) case.folds <- rep(1:n.folds,length.out=n) case.folds <- sample(case.folds) factor.logliks <- vector(length=n.folds) mixture.logliks <- vector(length=n.folds) for (fold in 1:n.folds) { test <- x[case.folds == fold,] train <- x[case.folds != fold,] fa <- factanal(train,factors=q.factors) factor.logliks[fold] <- loglik.factanal(test,fa) if (k.mixes > 1) { mvnmix <- mvnormalmixEM(train,k=k.mixes,...) mixture.logliks[fold] <- loglike.mvnormalmix(test,mvnmix) } else { mu <- colMeans(train) sigma <- cov(train) mixture.logliks[fold] <- sum(log(dmvnorm(test,mu,sigma))) } } cv.mixture <- mean(mixture.logliks) cv.factor <- mean(factor.logliks) choice <- ifelse(cv.mixture > cv.factor,"mixture","factor") return(list(choice=choice,cv.mixture=cv.mixture,cv.factor=cv.factor, mixture.logliks=mixture.logliks,factor.logliks=factor.logliks)) }