# Code to accompany Lecture 5 # Information-theoretic calculations on data frames and contingency tables # See Lecture 5 for background and examples # Calculate the entropy of a vector of counts or proportions # Inputs: Vector of numbers # Output: Entropy (in bits) entropy <- function(p) { # Assumes: p is a numeric vector if (sum(p) == 0) { return(0) # Case shows up when calculating conditional # entropies } p <- p/sum(p) # Normalize so it sums to 1 p <- p[p > 0] # Discard zero entries (because 0 log 0 = 0) H = -sum(p*log(p,base=2)) return(H) } # Get the expected information a word's indicator gives about a # document's class # Inputs: array of indicator counts # Calls: entropy() # Outputs: mutual information word.mutual.info <- function(counts) { # Assumes: counts is a numeric matrix # get the marginal entropy of the classes (rows) C marginal.entropy = entropy(rowSums(counts)) # Get the probability of each value of X probs <- colSums(counts)/sum(counts) # Calculate the entropy of each column column.entropies = apply(counts,2,entropy) conditional.entropy = sum(probs*column.entropies) mutual.information = marginal.entropy - conditional.entropy return(mutual.information) } # Count how many documents in each class do or don't contain a # word # Presumes that the data frame contains a column, named # "class.labels", which has the classes labels; may be more # than 2 classes # Inputs: dataframe of word counts with class labels (BoW), # word to check (word) # Outputs: table of counts word.class.indicator.counts <- function(BoW,word) { # What are the classes? classes <- levels(BoW[,"class.labels"]) # Prepare a matrix to store the counts, 1 row per class, 2 cols # (for present/absent) counts <- matrix(0,nrow=length(classes),ncol=2) # Name the rows to match the classes rownames(counts) = classes for (i in 1:length(classes)) { # Get a Boolean vector showing which rows belong to the class instance.rows = (BoW[,"class.labels"]==classes[i]) # sum of a boolean vector is the number of TRUEs n.class = sum(instance.rows) # Number of class instances present = sum(BoW[instance.rows,word] > 0) # present = Number of instances of class containing the word counts[i,1] = present counts[i,2] = n.class - present } return(counts) } # Calculate realized and expected information of word indicators # for classes # Assumes: one column of the data is named "class.labels" # Inputs: data frame of word counts with class labels # Calls: word.class.indicator.counts(), word.realized.info(), # word.mutual.info() # Output: two-column matrix giving the reduction in class entropy # when a word is present, and the expected reduction from # checking the word infos.bow <- function(BoW) { lexicon <- colnames(BoW) # One of these columns will be class.labels, that's not a # lexical item lexicon <- setdiff(lexicon,"class.labels") vocab.size = length(lexicon) word.infos <- matrix(0,nrow=vocab.size,ncol=2) # Name the rows so we know what we're talking about rownames(word.infos) = lexicon for (i in 1:vocab.size) { counts <- word.class.indicator.counts(BoW,lexicon[i]) word.infos[i,1] = word.realized.info(counts) word.infos[i,2] = word.mutual.info(counts) } return(word.infos) }