###############################
# Solutions to Homework 2     #
# 36-350, Fall 2008           #
###############################
# Presumes: code from the first assignment and its solutions is available
# and loaded
# Nearest neighbor classifier for documents
# Assign a new vector to the same class as the closest example vector which
# has a class label
# Recycles the nearest.points() function from the solutions to the last
# problem set: find the document in the data frame closest to the given
# document vector, and return the index of that document. Then take the
# corresponding element from the list of class labels.
# Optionally does IDF weighting and Euclidean length scaling
# Inputs: document to be classified (in vector form),
# data frame of bag-of-word example vectors,
# vector of class labels for examples,
# flag for whether to do IDF/Euclidean length normalization
# Presumes: a data frame has been created for bag-of-word vectors, column names
# being words;
# the document to be classified has been reduced to this form with the
# same columns;
# the class labels are correct and given in the same order as the rows
# in the data frame
# One could make the vector of labels an extra column of the
# data frame, which would be more R-slick and keep them from
# getting out of order
# Calls: idf.Euc.preproc(), nearest.points()
# Outputs: label guessed for first input,
# index of its nearest neighbor (for diagnostic purposes)
my.nn <- function(new.vec,examples.frame,examples.labels,preproc=TRUE) {
# Should check that examples.labels has one entry for each row of
# examples.frame
# In fact, should really require the user to name each row, and check
# that the row names are the same!
# Do we need to pre-process by applying IDF weights and scaling by length?
if (preproc) {
processed.data = idf.Euc.preproc(new.vec,examples.frame)
new.vec = processed.data$new.vector
examples.frame = processed.data$reference.frame
}
# Make the query vector into a matrix
new.vec = as.matrix(new.vec)
# Make it into a row matrix (if necessary)
if (dim(new.vec)[1] != 1) {
new.vec = t(new.vec)
}
# Find the closest match between new.vec and examples.frame
nearest.neighbor = nearest.points(new.vec,examples.frame)$which
# Get the label
neighbor.label = examples.labels[nearest.neighbor]
return(list(label=neighbor.label,nearest.neighbor = nearest.neighbor))
}
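# A hypothetical usage sketch ("nyt.frame" and "nyt.labels" are stand-in
# names for a labeled example frame and its label vector):
# my.nn(nyt.frame[1,], nyt.frame[-1,], nyt.labels[-1])
# guesses a label for the first document from its nearest neighbor among
# the rest, and reports which row that neighbor was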
# Do prototype classification
# Assign new vectors to the class whose prototype vector is closest
# Does the problem in two parts:
# 1. computes the prototypes by taking means of the labeled examples
# 2. passes that off to my.nn(), which already knows how to find the closest
# match
# Inputs: document to be classified (in vector form),
# data frame of bag-of-word example vectors,
# vector of class labels for examples,
# flag for IDF/Euclidean-length normalization
# Presumes: a data frame has been created for bag-of-word vectors, column names
# being words;
# the document to be classified has been reduced to this form with the
# same columns;
# the class labels are correct and given in the same order as the rows
# in the data frame
# One could make the vector of labels an extra column of the
# data frame, which would be more R-slick and keep them from
# getting out of order
# Calls: idf.Euc.preproc(), my.nn()
# Outputs: label guessed for first input
prototype.classifier <- function(new.vec,examples.frame,examples.labels,preproc=TRUE) {
# Should check that examples.labels has one entry for each row of
# examples.frame
# In fact, should really require the user to name each row, and check
# that the row names are the same!
# Do the vector to be classified and the data frame need pre-processing?
if (preproc) {
# If so, call the preprocessing function
processed.data = idf.Euc.preproc(new.vec,examples.frame)
new.vec = processed.data$new.vector
examples.frame = processed.data$reference.frame
}
# Make the prototypes
# This uses the moderately tricky R function aggregate()
# An alternative would be to write something like find.class.centers() below
class.prototypes = aggregate(examples.frame,
list(class.labels=examples.labels),
mean)
# This groups together all the rows in examples.frame corresponding to the
# same value in examples.labels and then takes the means of the columns.
# The output is made into a new data-frame with the first column named
# "class.labels" and giving the labels; the other columns are the same as
# in examples.frame.
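# A toy illustration of aggregate() (values invented for illustration):
# aggregate(data.frame(x=c(1,2,3,4)),
#           list(class.labels=c("a","a","b","b")), mean)
# gives a two-row frame: class.labels "a" with x = 1.5, "b" with x = 3.5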
# Having created the prototypes, pass the problem off to my.nn()
# We've already done the pre-processing here, so my.nn() doesn't need to
# re-do it
# We do need to split class.prototypes into its first column (the labels)
# and the others (the numerical values expected by my.nn())
label.set = class.prototypes$class.labels
my.match = my.nn(new.vec,class.prototypes[,-1],label.set,preproc=FALSE)
return(my.match$label)
}
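# As with my.nn(), a hypothetical usage sketch with the same stand-in names:
# prototype.classifier(nyt.frame[1,], nyt.frame[-1,], nyt.labels[-1])
# Note this returns only the guessed label, not an index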
# Group vectors by class labels and find the mean vector for each class
# More efficient just to use the R aggregate() function, but this shows what's
# actually going on
# Inputs: data frame of example vectors, vector of class labels
# Presumes: the rows in the data frame are in the same order as the labels
# Outputs: matrix with one row for each observed label, giving the mean of the
# columns for vectors with that label
find.class.centers <- function(examples,examples.labels) {
# Check what labels we actually have
label.set = sort(unique(examples.labels))
# Make a matrix to store the prototypes
# Each class gets its own row, the columns being the same as in the examples
# data frame
# Name each row after the corresponding class
# Note that "prototype" (without the "s") is already the name of a function
# in R's methods package; lengthening our name makes it less likely that
# typos will lead to bizarre bugs
class.prototypes = matrix(0,nrow=length(label.set),ncol=ncol(examples),
dimnames = list(label.set,colnames(examples)))
# For each label, take a separate mean of the example vectors
for (label in label.set) {
# Which rows are exemplars of that label?
exemplars = (examples.labels == label)
# Calculate a mean for each column, using only those rows
# Note that naming the rows in class.prototypes means that they can be
# addressed using their names --- we don't need numerical indices
# Need to check whether there is more than one exemplar, since a nice R
# trick only works if there is > 1 row
if (sum(exemplars) > 1) {
class.prototypes[label,] = apply(examples[exemplars,],2,mean)
# This complains if there is only one row
} else {
class.prototypes[label,] = unlist(examples[exemplars,])
# This just takes the single matching row; unlist() flattens the one-row
# data frame into a plain vector so it fits into the matrix row
# N.B. at least one component of exemplars must be TRUE (why?)
}
}
return(class.prototypes)
}
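# A self-contained toy check (numbers invented for illustration):
# toy.frame <- data.frame(w1=c(1,3,0,2), w2=c(0,2,4,4))
# toy.labels <- c("a","a","b","b")
# find.class.centers(toy.frame, toy.labels)
# gives row "a" = (2,1) and row "b" = (1,4), agreeing with
# aggregate(toy.frame, list(class.labels=toy.labels), mean)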
# Estimate error rate of a classifier by the leave-one-out method
# Each vector is, in turn, omitted from the training set, and then used
# as the test vector; the error rate is the proportion mis-classified
# Actual classification is done by either my.nn() or prototype.classifier()
# To "vectorize" the calculation, the function creates an internal function which takes a row
# number as an argument and tries to classify the vector in that row using the others. This
# internal function can then be applied to the numbers from 1 to n via sapply().
# This is a common trick for avoiding explicit loops in R, which are slow.
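# For instance, a toy version of the same trick:
# is.even <- function(i) { (i %% 2) == 0 }
# sapply(1:4, is.even)   # gives FALSE TRUE FALSE TRUE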
# Inputs: data-frame of training vectors,
# vector of class labels for training vectors,
# indicator of the method (allowed values are "nn" and "prototypes"),
# flag for IDF/Euclidean length preprocessing (default TRUE, but FALSE is handy for debugging)
# Presumes: training vectors have been put into a single data frame with common
# columns;
# order of rows in that frame matches order of labels
# Calls: my.nn(), prototype.classifier()
# Outputs: error rate (as a fraction),
# indices of the mis-classified training vectors (helps debugging and evaluation)
leave.one.out.error <- function(training.vectors,training.labels,method="nn",preproc=TRUE) {
# Ideally, should check that training.vectors has as many rows as
# training.labels has entries
# Need to refer back to the number of training examples repeatedly
n.examples = nrow(training.vectors)
# Make a function which leaves one training example out and tries to classify
# it
# Argument is the row number of the example to leave out
# Returned value is Boolean, TRUE if the classifier is in error, FALSE if
# it is right
LOO <- function(i) {
# Extract the left-out vector and its label
q = training.vectors[i,]
true.label = training.labels[i]
# Make versions of the data leaving that out
LOO.frame = training.vectors[-i,]
LOO.labels = training.labels[-i]
# Use the R switch() function to choose an actual classifier, based on the "method" argument
# See help(switch) for the syntax.
# The advantages of switch over a set of if() statements:
# 1. Cleaner code
# 2. Easier to modify later; if we code up a new classifier method we can just add another
# case to the switch.
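# For instance, switch("b", a=1, b=2, stop("no match")) evaluates to 2,
# while switch("z", a=1, b=2, stop("no match")) hits the unnamed default
# and signals the error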
guess <- switch(method,
nn=my.nn(q,LOO.frame,LOO.labels,preproc)$label,
prototypes=prototype.classifier(q,LOO.frame,LOO.labels,preproc),
stop("Unknown classifier method: ", method)
)
return((guess != true.label))
}
# Apply the leave-out function to the integers from 1 to n.examples, getting back a Boolean
# vector
classifier.errors = sapply(1:n.examples,LOO)
# The sum of a Boolean vector is the number of TRUE entries, so the next line gives the
# fraction of errors
error.rate = sum(classifier.errors)/n.examples
# Return the error rate and the set of indices on which there were errors
# the latter helps with debugging the code, and with understanding why it went wrong
return(list(error.rate=error.rate,error.indices=which(classifier.errors)))
}
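# Hypothetical usage sketch (stand-in data names again):
# loo <- leave.one.out.error(nyt.frame, nyt.labels, method="prototypes")
# loo$error.rate gives the fraction mis-classified; loo$error.indices says
# which training vectors were missed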
# Pre-process a query vector and a data frame with IDF weights and Euclidean
# length normalization
# IDF weights are taken solely from the frame, not including the query vector
# Inputs: new vector to classify/match (q),
# data frame (f) of vectors
# Presumes: q and f have the same column names
# Calls: get.idf.weights(), scale.cols(), div.by.euc.length()
# Outputs: list whose components are the standardized vector and data-frame
idf.Euc.preproc <- function(q,f) {
# Ideally, check here that the two arguments agree on their column names
# Scale the rows by Euclidean length
f = div.by.euc.length(f)
# Ditto the query
q = q/sqrt(sum(q^2))
# Do IDF scaling on the targets AND the query
# The function idf.weight() calculates the weights and re-scales the
# data frame, but we need the weights themselves to re-scale the query as
# well, so we write our own function get.idf.weights() --- see below
# Get the weights
idf = get.idf.weights(f)
# Scale the columns in the frame, modifying in place
f = scale.cols(f,idf)
# Scale the columns in the query vector too
q = q * idf
# Return the standardized query vector and data frame together in a list
return(list(new.vector=q,reference.frame=f))
}
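# Hypothetical usage sketch (stand-in names; div.by.euc.length() and
# scale.cols() must already be loaded from the first assignment):
# std <- idf.Euc.preproc(new.doc, nyt.frame)
# my.nn(std$new.vector, std$reference.frame, nyt.labels, preproc=FALSE)
# then skips the redundant pre-processing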
# Return the vector of IDF weights from a data frame
# The core of idf.weight(), but leaving out the actual re-scaling
# Input: data frame
# Output: weight vector
get.idf.weights <- function(x) {
# Count the number of documents (rows) containing each word
doc.freq <- colSums(x>0)
# Avoid dividing by zero for words which appear in no document
doc.freq[doc.freq == 0] <- 1
# IDF weight = log of (number of documents / number containing the word)
w <- log(nrow(x)/doc.freq)
return(w)
}
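# A self-contained toy check: in a two-document frame where "w1" appears in
# both documents and "w2" in only one,
# get.idf.weights(data.frame(w1=c(1,2), w2=c(0,3)))
# returns c(w1=log(2/2), w2=log(2/1)), i.e. approximately 0 and 0.69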