# Code to accompany Lecture 6
# Creating multi-dimensional contingency tables and calculating
# information from them
# See Lecture 6 for more
# Relies on functions from Lecture 5

########## IMPORTANT ##########
# Must source lecture-05.R    #
###############################

# Create a multi-dimensional table from given columns of a
# data-frame
# Inputs: frame, vector of column numbers or names
# Outputs: multidimensional contingency table, with one dimension per
#   requested column; dimensions are labelled with the column names
columns.to.table <- function(frame, colnums) {
  # Name the dimensions of the table for comprehensibility
  if (is.numeric(colnums)) {
    # if we gave column numbers, get names from the frame
    table.names <- colnames(frame)[colnums]
  } else {
    # if we gave column names, use them
    table.names <- colnums
  }
  # Pick out the individual columns as a plain (unnamed) list, so each one
  # becomes a positional argument of table().  unname() keeps any names on
  # colnums from being mistaken for named arguments of table().
  table.args <- lapply(unname(colnums), function(i) frame[, i])
  # Pass the dimension names verbatim.  Building the call as a list instead
  # of pasting together a command string and eval(parse())-ing it means the
  # names are never re-parsed (quotes in a column name are harmless) and
  # are not padded with stray spaces.
  if (!is.null(table.names)) {
    table.args <- c(table.args, list(dnn = as.character(table.names)))
  }
  # Equivalent to typing table(col1, col2, ..., dnn = c("name1", ...))
  return(do.call(table, table.args))
}

# Calculate the joint entropy of given columns in a data frame
# Inputs: frame, vector of column numbers or names
# Calls: columns.to.table(), entropy()
# Output: the joint entropy of the desired features, in bits
jt.entropy.columns <- function(frame, colnums) {
  tabulations <- columns.to.table(frame, colnums)
  # Flatten the table of counts into a plain vector before handing it on.
  # NOTE(review): entropy() comes from lecture-05.R and is not shown here;
  # it receives raw cell counts, so it presumably normalises them itself --
  # confirm against that file.
  H <- entropy(as.vector(tabulations))
  return(H)
}

# Compute the information in multiple features about the outcome
# Inputs: data frame, vector of
# feature numbers,
# number of target feature (optional, default=1)
# Calls: jt.entropy.columns
# Output: mutual information in bits
#
# Mutual information between the target column and a set of feature
# columns, computed as H(target) + H(features) - H(target, features).
#   frame:        the data frame
#   feature.cols: vector of feature column numbers
#   target.col:   number of the target column (default 1)
info.in.multi.columns <- function(frame, feature.cols, target.col = 1) {
  H.target <- jt.entropy.columns(frame, target.col)
  H.features <- jt.entropy.columns(frame, feature.cols)
  H.joint <- jt.entropy.columns(frame, c(target.col, feature.cols))
  return(H.target + H.features - H.joint)
}

# Information about target after adding a new column to existing
# set
# Inputs: new column, vector of old columns, data frame,
#   target column (default 1)
# Calls: info.in.multi.columns()
# Output: new mutual information, in bits
# The new column comes first in the argument list so that this function
# can be mapped directly over a vector of candidate columns.
info.in.extra.column <- function(new.col, old.cols, frame, target.col = 1) {
  mi <- info.in.multi.columns(frame, c(old.cols, new.col),
                              target.col = target.col)
  return(mi)
}

# Identify the best column to add to an existing set
# Inputs: data frame, currently-picked columns,
#   target column (default 1)
# Calls: info.in.extra.column()
# Output: index of the best feature; a zero-length vector if every
#   column is already picked or is the target
best.next.column <- function(frame, old.cols, target.col = 1) {
  # Which columns might we add?
  # seq_len() rather than 1:ncol(frame), so a zero-column frame gives no
  # candidates instead of the bogus candidates 1 and 0
  possible.cols <- setdiff(seq_len(ncol(frame)), c(old.cols, target.col))
  # How good are each of those columns?
  # vapply() always yields a numeric vector, even with no candidates
  # (sapply() would return an empty list in that case)
  infos <- vapply(possible.cols,
                  function(new.col) {
                    info.in.extra.column(new.col, old.cols = old.cols,
                                         frame = frame,
                                         target.col = target.col)
                  },
                  numeric(1))
  # which of these columns is biggest?
  best.possibility <- which.max(infos)
  # what column of the original data frame is that?
  best.index <- possible.cols[best.possibility]
  return(best.index)
}

# Identify the best q columns for a given target variable
# Greedy forward selection: repeatedly add the single column that gives
# the largest mutual information with the target, given those already
# picked.
# Inputs: data frame, q, target column (default 1)
# Calls: best.next.column()
# Output: vector of column indices, in the order they were selected;
#   at most q of them (fewer if the frame has fewer non-target columns),
#   and none at all when q is 0
best.q.columns <- function(frame, q, target.col = 1) {
  possible.cols <- setdiff(seq_len(ncol(frame)), target.col)
  # Never try to pick more columns than exist, and never fewer than zero
  n.pick <- max(0, min(q, length(possible.cols)))
  selected.cols <- integer(0)
  # seq_len() rather than 1:q, so that q = 0 selects nothing
  # (1:0 would run the loop twice)
  for (k in seq_len(n.pick)) {
    new.col <- best.next.column(frame, selected.cols, target.col)
    selected.cols <- c(selected.cols, new.col)
  }
  return(selected.cols)
}