# Code to accompany Lecture 5
# Information-theoretic calculations on data frames and contingency tables
# See Lecture 5 for background and examples
# Calculate the Shannon entropy of a vector of counts or proportions
# Inputs: numeric vector of non-negative counts or proportions (p)
# Output: entropy (in bits)
entropy <- function(p) {
  # Assumes: p is a numeric vector with non-negative entries
  if (sum(p) == 0) {
    # All-zero vector: shows up when calculating conditional
    # entropies over an empty stratum; define its entropy as 0
    return(0)
  }
  p <- p / sum(p) # Normalize so it sums to 1
  p <- p[p > 0]   # Discard zero entries (because 0 log 0 = 0)
  # log2() gives the entropy in bits
  -sum(p * log2(p))
}
# Get the expected information a word's indicator gives about a
# document's class (the mutual information I[C; X])
# Inputs: numeric matrix of indicator counts, rows = classes (C),
#         columns = indicator values (X, e.g. present/absent)
# Calls: entropy()
# Output: mutual information (in bits)
word.mutual.info <- function(counts) {
  # Assumes: counts is a numeric matrix
  # Marginal entropy H[C] of the classes (rows)
  marginal.entropy <- entropy(rowSums(counts))
  # Marginal probability of each value of X (columns)
  probs <- colSums(counts) / sum(counts)
  # Entropy of the class distribution within each column: H[C|X=x]
  column.entropies <- apply(counts, 2, entropy)
  # Conditional entropy H[C|X] = sum over x of P(X=x) H[C|X=x]
  conditional.entropy <- sum(probs * column.entropies)
  # I[C; X] = H[C] - H[C|X]
  marginal.entropy - conditional.entropy
}
# Count how many documents in each class do or don't contain a
# word
# Presumes that the data frame contains a column, named
# "class.labels", which has the class labels; may be more
# than 2 classes
# Inputs: dataframe of word counts with class labels (BoW),
# word to check (word)
# Outputs: matrix of counts, one row per class; column 1 = number
# of documents containing the word, column 2 = number lacking it
word.class.indicator.counts <- function(BoW, word) {
  # What are the classes?
  # NOTE(review): levels() presumes class.labels is a factor; a plain
  # character column would give NULL here -- confirm with callers
  classes <- levels(BoW[, "class.labels"])
  # Prepare a matrix to store the counts, 1 row per class, 2 cols
  # (for present/absent)
  counts <- matrix(0, nrow = length(classes), ncol = 2)
  # Name the rows to match the classes
  rownames(counts) <- classes
  # seq_along() is safe even when there are no classes (1:0 is not)
  for (i in seq_along(classes)) {
    # Get a Boolean vector showing which rows belong to the class
    instance.rows <- (BoW[, "class.labels"] == classes[i])
    # sum of a boolean vector is the number of TRUEs
    n.class <- sum(instance.rows) # Number of class instances
    # Number of instances of the class containing the word
    present <- sum(BoW[instance.rows, word] > 0)
    counts[i, 1] <- present
    counts[i, 2] <- n.class - present
  }
  counts
}
# Calculate realized and expected information of word indicators
# for classes, for every word in a bag-of-words data frame
# Assumes: one column of the data is named "class.labels"; every
# other column is a per-word count
# Inputs: data frame of word counts with class labels (BoW)
# Calls: word.class.indicator.counts(), word.realized.info(),
# word.mutual.info()
# Output: two-column matrix, one row per word, giving the reduction
# in class entropy when a word is present (col 1), and the expected
# reduction from checking the word (col 2)
infos.bow <- function(BoW) {
  lexicon <- colnames(BoW)
  # One of these columns will be class.labels, that's not a
  # lexical item
  lexicon <- setdiff(lexicon, "class.labels")
  vocab.size <- length(lexicon)
  # Preallocate the result matrix, one row per word
  word.infos <- matrix(0, nrow = vocab.size, ncol = 2)
  # Name the rows so we know what we're talking about
  rownames(word.infos) <- lexicon
  # seq_len() is safe even for an empty lexicon (1:0 is not)
  for (i in seq_len(vocab.size)) {
    counts <- word.class.indicator.counts(BoW, lexicon[i])
    word.infos[i, 1] <- word.realized.info(counts)
    word.infos[i, 2] <- word.mutual.info(counts)
  }
  word.infos
}