## ----include=FALSE-------------------------------------------------------
library(knitr)
opts_chunk$set(size = "small", background = "white", highlight = FALSE,
               cache = TRUE, autodep = TRUE, tidy = TRUE, warning = FALSE,
               message = FALSE)

## ------------------------------------------------------------------------
# Load the NYT bag-of-words data and attach class labels
# (rows 1--57 are Arts stories, rows 58--102 are Music stories)
nyt.frame <- read.csv("http://www.stat.cmu.edu/~cshalizi/dm/19/lectures/15/nyt.frame.csv")
dim(nyt.frame)
class.labels <- c(rep("art", 57), rep("music", 45))
nyt.frame <- data.frame(class.labels = as.factor(class.labels), nyt.frame)
dim(nyt.frame)

## ------------------------------------------------------------------------
# Calculate the entropy of a vector of counts or proportions
# Inputs: Vector of numbers
# Output: Entropy (in bits)
entropy <- function(p) {
  # Assumes: p is a numeric vector
  if (sum(p) == 0) {
    # Case shows up when calculating conditional entropies
    return(0)
  }
  p <- p / sum(p)  # Normalize so it sums to 1
  p <- p[p > 0]    # Discard zero entries (because 0 log 0 = 0)
  H <- -sum(p * log(p, base = 2))
  return(H)
}

## ------------------------------------------------------------------------
entropy(c(0.5, 0.5))
entropy(c(1, 1))
entropy(c(45, 45))

## ------------------------------------------------------------------------
entropy(c(57, 45))

## ------------------------------------------------------------------------
table(nyt.frame[, "class.labels"])
entropy(table(nyt.frame[, "class.labels"]))

## ------------------------------------------------------------------------
# Get the expected information a word's indicator gives about a
# document's class
# Inputs: array of indicator counts (1 row per class,
#   1 column per value of the indicator)
# Calls: entropy()
# Outputs: mutual information (in bits)
word.mutual.info <- function(counts) {
  # Assumes: counts is a numeric matrix
  # Marginal entropy of the classes (rows), H[C]
  marginal.entropy <- entropy(rowSums(counts))
  # Probability of each value of the indicator X
  probs <- colSums(counts) / sum(counts)
  # Class entropy within each column, H[C | X = x]
  column.entropies <- apply(counts, 2, entropy)
  # H[C|X] = sum over x of P(X = x) H[C | X = x]
  conditional.entropy <- sum(probs * column.entropies)
  mutual.information <- marginal.entropy - conditional.entropy
  return(mutual.information)
}

## ------------------------------------------------------------------------
word.mutual.info(matrix(c(12, 0, 45, 45), nrow = 2))

## ------------------------------------------------------------------------
# Count how many documents in each class do or don't contain a word
# Presumes that the data frame contains a column, named
# "class.labels", which has the class labels; may be more
# than 2 classes
# Inputs: dataframe of word counts with class labels (BoW),
#   word to check (word)
# Outputs: table of counts, 1 row per class, 2 cols
#   (col 1 = word present, col 2 = word absent)
word.class.indicator.counts <- function(BoW, word) {
  # What are the classes?
  classes <- levels(BoW[, "class.labels"])
  # Prepare a matrix to store the counts, 1 row per class, 2 cols
  # (for present/absent)
  counts <- matrix(0, nrow = length(classes), ncol = 2)
  # Name the rows to match the classes
  rownames(counts) <- classes
  for (i in seq_along(classes)) {
    # Boolean vector showing which rows belong to the class
    instance.rows <- (BoW[, "class.labels"] == classes[i])
    # sum of a boolean vector is the number of TRUEs
    n.class <- sum(instance.rows)  # Number of class instances
    # Number of instances of the class containing the word
    present <- sum(BoW[instance.rows, word] > 0)
    counts[i, 1] <- present
    counts[i, 2] <- n.class - present
  }
  return(counts)
}

## ------------------------------------------------------------------------
word.class.indicator.counts(nyt.frame, "paint")

## ------------------------------------------------------------------------
word.mutual.info(word.class.indicator.counts(nyt.frame, "paint"))

## ------------------------------------------------------------------------
# Calculate realized and expected information of word indicators
# for classes
# Assumes: one column of the data is named "class.labels"
# Inputs: data frame of word counts with class labels
# Calls: word.class.indicator.counts(), word.realized.info(),
#   word.mutual.info()
# Output:
#   two-column matrix giving the reduction in class entropy
#   when a word is present, and the expected reduction from
#   checking the word
info.bows <- function(BoW) {
  lexicon <- colnames(BoW)
  # One of these columns will be class.labels; that's not a
  # lexical item
  lexicon <- setdiff(lexicon, "class.labels")
  vocab.size <- length(lexicon)
  word.infos <- matrix(0, nrow = vocab.size, ncol = 2)
  # Name the rows so we know what we're talking about
  rownames(word.infos) <- lexicon
  for (i in seq_len(vocab.size)) {
    counts <- word.class.indicator.counts(BoW, lexicon[i])
    # NOTE(review): word.realized.info() is not defined anywhere in
    # this file; presumably it comes from elsewhere in the lecture
    # code -- confirm before running this chunk
    word.infos[i, 1] <- word.realized.info(counts)
    word.infos[i, 2] <- word.mutual.info(counts)
  }
  return(word.infos)
}

## ----echo=FALSE----------------------------------------------------------
info.matrix <- info.bows(nyt.frame)

## ------------------------------------------------------------------------
art.painting.indicators <- matrix(c(22, 25, 2, 8, 0, 8, 0, 37),
                                  byrow = TRUE, nrow = 2)
word.mutual.info(art.painting.indicators)

## ------------------------------------------------------------------------
ape <- table(nyt.frame[, "art"] > 0, nyt.frame[, "painting"] > 0,
             nyt.frame[, "evening"] > 0,
             dnn = c("art", "painting", "evening"))

## ------------------------------------------------------------------------
ape[2, 2, 1]

## ------------------------------------------------------------------------
ape <- table(nyt.frame[, "art"] > 0, nyt.frame[, "painting"] > 0,
             nyt.frame[, "evening"] > 0,
             dnn = c("art", "painting", "evening"))
ape

## ------------------------------------------------------------------------
as.vector(ape)

## ------------------------------------------------------------------------
entropy(as.vector(ape))

## ------------------------------------------------------------------------
# Create a multi-dimensional table from given columns of a
# data-frame
# Inputs: frame, vector of column numbers or names
# Outputs: multidimensional contingency table
columns.to.table <- function(frame, colnums) {
  # table() accepts a single data frame as its argument and
  # cross-tabulates its columns, naming the dimensions after the
  # columns -- whether colnums holds numbers or names, subsetting
  # the frame keeps the right column names, so the dimension names
  # come out the same as before.  This replaces the previous
  # build-a-command-string-and-eval(parse()) construction, which
  # was fragile and hard to read.
  return(table(frame[, colnums, drop = FALSE]))
}

## ------------------------------------------------------------------------
nyt.indicators <- data.frame(class.labels = nyt.frame[, 1],
                             nyt.frame[, -1] > 0)

## ------------------------------------------------------------------------
columns.to.table(nyt.indicators, c("class.labels"))
columns.to.table(nyt.indicators, c("class.labels", "art"))
columns.to.table(nyt.indicators, c("art", "painting", "evening"))

## ------------------------------------------------------------------------
# Calculate the joint entropy of given columns in a data frame
# Inputs: frame, vector of column numbers or names
# Calls: columns.to.table(), entropy()
# Output: the joint entropy of the desired features, in bits
jt.entropy.columns <- function(frame, colnums) {
  tabulations <- columns.to.table(frame, colnums)
  # Joint entropy is just the entropy of the flattened joint counts
  H <- entropy(as.vector(tabulations))
  return(H)
}

## ------------------------------------------------------------------------
jt.entropy.columns(nyt.indicators, c("art", "painting", "evening"))
## ------------------------------------------------------------------------
# Compute the information in multiple features about the outcome
# Inputs: data frame, vector of feature numbers,
#   number of target feature (optional, default=1)
# Calls: jt.entropy.columns()
# Output: mutual information in bits
info.in.multi.columns <- function(frame, feature.cols, target.col = 1) {
  # I[C; X] = H[C] + H[X] - H[C, X]
  H.target <- jt.entropy.columns(frame, target.col)
  H.features <- jt.entropy.columns(frame, feature.cols)
  H.joint <- jt.entropy.columns(frame, c(target.col, feature.cols))
  return(H.target + H.features - H.joint)
}

## ------------------------------------------------------------------------
info.in.multi.columns(nyt.indicators, 244)
info.in.multi.columns(nyt.indicators, 2770)

## ------------------------------------------------------------------------
info.in.multi.columns(nyt.indicators, c(244, 2770))

## ------------------------------------------------------------------------
# Information about target after adding a new column to existing set
# Inputs: new column, vector of old columns, data frame,
#   target column (default 1)
# Calls: info.in.multi.columns()
# Output: new mutual information, in bits
info.in.extra.column <- function(new.col, old.cols, frame, target.col = 1) {
  mi <- info.in.multi.columns(frame, c(old.cols, new.col),
                              target.col = target.col)
  return(mi)
}

## ------------------------------------------------------------------------
# Identify the best column to add to an existing set
# Inputs: data frame, currently-picked columns,
#   target column (default 1)
# Calls: info.in.extra.column()
# Output: index of the best feature
best.next.column <- function(frame, old.cols, target.col = 1) {
  # Which columns might we add?
  possible.cols <- setdiff(seq_len(ncol(frame)), c(old.cols, target.col))
  # How good are each of those columns?
  infos <- sapply(possible.cols, info.in.extra.column,
                  old.cols = old.cols, frame = frame,
                  target.col = target.col)
  # Which of these candidates gives the most information?
  best.possibility <- which.max(infos)
  # Map back to a column index of the original data frame
  best.index <- possible.cols[best.possibility]
  return(best.index)
}

## ------------------------------------------------------------------------
# Greedily identify the best q columns for a given target variable
# Inputs: data frame, q, target column (default 1)
# Calls: best.next.column()
# Output: vector of column indices, in the order selected
best.q.columns <- function(frame, q, target.col = 1) {
  # (A previously-computed possible.cols set was never used here;
  # best.next.column() works out the candidate set itself, so it
  # has been removed.)
  selected.cols <- c()
  for (k in seq_len(q)) {
    new.col <- best.next.column(frame, selected.cols, target.col)
    selected.cols <- c(selected.cols, new.col)
  }
  return(selected.cols)
}

## ------------------------------------------------------------------------
best.7 <- best.q.columns(nyt.indicators, 7)
colnames(nyt.indicators)[best.7]
info.in.multi.columns(nyt.indicators, best.7)