# Take a quick look at the US in 1977 head(state.x77) summary(state.x77) # Do the PCA state.pca <- prcomp(state.x77,scale.=TRUE) # Look at the "biplot" showing projections of data points and how the # observables align with the components biplot(state.pca,cex=c(0.5,0.75)) # "Scree" plot of the variance on each component plot(state.pca,type="l") # Helper function for mapping the states with label sizes scaling with # a vector # Inputs: Vector of sizes, lower and upper limit limits for label sizes, # optional extra arguments to plot() # Outputs: Vector of scales sizes, invisibly plot.states_scaled <- function(sizes,min.size=0.4,max.size=2,...) { plot(state.center,type="n",...) out.range = max.size - min.size in.range = max(sizes)-min(sizes) scaled.sizes = out.range*((sizes-min(sizes))/in.range) text(state.center,state.abb,cex=scaled.sizes + min.size) invisible(scaled.sizes) } # Make a map based on the first principal component plot.states_scaled(state.pca\$x[,1],min.size=0.3,max.size=1.5, xlab="longitude",ylab="latitude") # What might we call this component? # Now look at a one-factor model of the same data # There are various ways of estimating the latent factor scores; the last # argument picks a particular one state.fa1 <- factanal(state.x77,factors=1,scores="regression") # Look at the estimates of the parameters state.fa1 # Map it again plot.states_scaled(state.fa1\$score[,1],min.size=0.3,max.size=1.5, xlab="longitude",ylab="latitude") # How many factors? # Can't fit more than 4 factors to this few variables... # factanal() automatically runs a goodness-of-fit test; this assumes # everything's Gaussian, but it can work reasonaby if non-Gaussian distributions # aren't too non-Gaussian, and there's enough data (though n=50 is a bit # dubious) pvalues <- sapply(1:4,function(q){factanal(state.x77,factors=q)\$PVAL}) signif(pvalues,2) plot(1:4,pvalues,xlab="q (number of factors)", ylab="pvalue", log="y",ylim=c(1e-11,0.04)) abline(h=0.05,lty=2) # Note: with PCA, adding on extra components doesn't change the earlier ones # This is NOT true of factor models; we should go back and look at the first # factor when using more