###############################
# Solutions to Homework 2     #
# 36-350, Fall 2008           #
###############################
# Presumes: code from the first assignment and its solutions is available
# and loaded
# Nearest neighbor classifier for documents
# Assign a new vector to the same class as the closest example vector which
# has a class label
# Recycles the nearest.points() function from the solutions to the last
# problem set: find the document in the data frame closest to the given
# document vector, and return the index of that document. Then take the
# corresponding element from the list of class labels.
# Optionally does IDF weighting and Euclidean length scaling
# Inputs: document to be classified (in vector form),
# data frame of bag-of-word example vectors,
# vector of class labels for examples,
# flag for whether to do IDF/Euclidean length normalization
# Presumes: a data frame has been created for bag-of-word vectors, column names
# being words;
# the document to be classified has been reduced to this form with the
# same columns;
# the class labels are correct and given in the same order as the rows
# in the data frame
# One could make the vector of labels an extra column of the
# data frame, which would be more R-slick and keep them from
# getting out of order
# Calls: idf.Euc.preproc(), nearest.points()
# Outputs: label guessed for first input,
# index of its nearest neighbor (for diagnostic purposes)
my.nn <- function(new.vec,examples.frame,examples.labels,preproc=TRUE) {
# Should check that examples.labels has one entry for each row of
# examples.frame
# In fact, should really require the user to name each row, and check
# that the row names are the same!
# Do we need to pre-process by applying IDF weights and scaling by length?
if (preproc) {
processed.data = idf.Euc.preproc(new.vec,examples.frame)
new.vec = processed.data$new.vector
examples.frame = processed.data$reference.frame
}
# Make the query vector into a matrix
new.vec = as.matrix(new.vec)
# Make it into a row matrix (if necessary)
if (dim(new.vec)[1] != 1) {
new.vec = t(new.vec)
}
# Find the closest match between new.vec and examples.frame
nearest.neighbor = nearest.points(new.vec,examples.frame)$which
# Get the label
neighbor.label = examples.labels[nearest.neighbor]
return(list(label=neighbor.label,nearest.neighbor = nearest.neighbor))
}
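# A hypothetical usage sketch ("nyt.frame" and "nyt.labels" are stand-in
# names for a labeled example frame and its label vector):
# my.nn(nyt.frame[1,], nyt.frame[-1,], nyt.labels[-1])
# guesses a label for the first document from its nearest neighbor among
# the rest, and reports which row that neighbor was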
# Do prototype classification
# Assign new vectors to the class whose prototype vector is closest
# Does the problem in two parts:
# 1. computes the prototypes by taking means of the labeled examples
# 2. passes that off to my.nn(), which already knows how to find the closest
# match
# Inputs: document to be classified (in vector form),
# data frame of bag-of-word example vectors,
# vector of class labels for examples,
# flag for IDF/Euclidean-length normalization
# Presumes: a data frame has been created for bag-of-word vectors, column names
# being words;
# the document to be classified has been reduced to this form with the
# same columns;
# the class labels are correct and given in the same order as the rows
# in the data frame
# One could make the vector of labels an extra column of the
# data frame, which would be more R-slick and keep them from
# getting out of order
# Calls: idf.Euc.preproc(), my.nn()
# Outputs: label guessed for first input
prototype.classifier <- function(new.vec,examples.frame,examples.labels,preproc=TRUE) {
# Should check that examples.labels has one entry for each row of
# examples.frame
# In fact, should really require the user to name each row, and check
# that the row names are the same!
# Do the vector to be classified and the data frame need pre-processing?
if (preproc) {
# If so, call the preprocessing function
processed.data = idf.Euc.preproc(new.vec,examples.frame)
new.vec = processed.data$new.vector
examples.frame = processed.data$reference.frame
}
# Make the prototypes
# This uses the moderately tricky R function aggregate()
# An alternative would be to write something like find.class.centers() below
class.prototypes = aggregate(examples.frame,
list(class.labels=examples.labels),
mean)
# This groups together all the rows in examples.frame corresponding to the
# same value in examples.labels and then takes the means of the columns.
# The output is made into a new data-frame with the first column named
# "class.labels" and giving the labels; the other columns are the same as
# in examples.frame.
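# A toy illustration of aggregate() (values invented for illustration):
# aggregate(data.frame(x=c(1,2,3,4)),
#           list(class.labels=c("a","a","b","b")), mean)
# gives a two-row frame: class.labels "a" with x = 1.5, "b" with x = 3.5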
# Having created the prototypes, pass the problem off to my.nn()
# We've already done the pre-processing here, so my.nn() doesn't need to
# re-do it
# We do need to split class.prototypes into its first column (the labels)
# and the others (the numerical values expected by my.nn())
label.set = class.prototypes$class.labels
my.match = my.nn(new.vec,class.prototypes[,-1],label.set,preproc=FALSE)
return(my.match$label)
}
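# As with my.nn(), a hypothetical usage sketch with the same stand-in names:
# prototype.classifier(nyt.frame[1,], nyt.frame[-1,], nyt.labels[-1])
# Note this returns only the guessed label, not an index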
# Group vectors by class labels and find the mean vector for each class
# More efficient just to use the R aggregate() function, but this shows what's
# actually going on
# Inputs: data frame of example vectors, vector of class labels
# Presumes: the rows in the data frame are in the same order as the labels
# Outputs: matrix with one row for each observed label, giving the mean of the
# columns for vectors with that label
find.class.centers <- function(examples,examples.labels) {
# Check what labels we actually have
label.set = sort(unique(examples.labels))
# Make a matrix to store the prototypes
# Each class gets its own row, the columns being the same as in the examples
# data frame
# Name each row after the corresponding class
# Note that "prototype" (without the "s") is already the name of a function
# in R's methods package; lengthening our name makes it less likely that
# typos will lead to bizarre bugs
class.prototypes = matrix(0,nrow=length(label.set),ncol=ncol(examples),
dimnames = list(label.set,colnames(examples)))
# For each label, take a separate mean of the example vectors
for (label in label.set) {
# Which rows are exemplars of that label?
exemplars = (examples.labels == label)
# Calculate a mean for each column, using only those rows
# Note that naming the rows in class.prototypes means that they can be
# addressed using their names --- we don't need numerical indices
# Need to check whether there is more than one exemplar, since a nice R
# trick only works if there is > 1 row
if (sum(exemplars) > 1) {
class.prototypes[label,] = apply(examples[exemplars,],2,mean)
# This complains if there is only one row
} else {
class.prototypes[label,] = unlist(examples[exemplars,])
# This just takes the single matching row; unlist() flattens the one-row
# data frame into a plain vector so it fits into the matrix row
# N.B. at least one component of exemplars must be TRUE (why?)
}
}
return(class.prototypes)
}
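# A self-contained toy check (numbers invented for illustration):
# toy.frame <- data.frame(w1=c(1,3,0,2), w2=c(0,2,4,4))
# toy.labels <- c("a","a","b","b")
# find.class.centers(toy.frame, toy.labels)
# gives row "a" = (2,1) and row "b" = (1,4), agreeing with
# aggregate(toy.frame, list(class.labels=toy.labels), mean)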
# Estimate error rate of a classifier by the leave-one-out method
# Each vector is, in turn, omitted from the training set, and then used
# as the test vector; the error rate is the proportion mis-classified
# Actual classification is done by either my.nn() or prototype.classifier()
# To "vectorize" the calculation, the function creates an internal function which takes a row
# number as an argument and tries to classify the vector in that row using the others. This
# internal function can then be applied to the numbers from 1 to n via sapply().
# This is a common trick for avoiding explicit loops in R, which are slow.
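# For instance, a toy version of the same trick:
# is.even <- function(i) { (i %% 2) == 0 }
# sapply(1:4, is.even)   # gives FALSE TRUE FALSE TRUE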
# Inputs: data-frame of training vectors,
# vector of class labels for training vectors,
# indicator of the method (allowed values are "nn" and "prototypes"),
# flag for IDF/Euclidean length preprocessing (default TRUE, but FALSE is handy for debugging)
# Presumes: training vectors have been put into a single data frame with common
# columns;
# order of rows in that frame matches order of labels
# Calls: my.nn(), prototype.classifier()
# Outputs: error rate (as a fraction),
# indices of the mis-classified training vectors (helps debugging and evaluation)
leave.one.out.error <- function(training.vectors,training.labels,method="nn",preproc=TRUE) {
# Ideally, should check that training.vectors has as many rows as
# training.labels has entries
# Need to refer back to the number of training examples repeatedly
n.examples = nrow(training.vectors)
# Make a function which leaves one training example out and tries to classify
# it
# Argument is the row number of the example to leave out
# Returned value is Boolean, TRUE if the classifier is in error, FALSE if
# it is right
LOO <- function(i) {
# Extract the left-out vector and its label
q = training.vectors[i,]
true.label = training.labels[i]
# Make versions of the data leaving that out
LOO.frame = training.vectors[-i,]
LOO.labels = training.labels[-i]
# Use the R switch() function to choose an actual classifier, based on the "method" argument
# See help(switch) for the syntax.
# The advantages of switch over a set of if() statements:
# 1. Cleaner code
# 2. Easier to modify later; if we code up a new classifier method we can just add another
# case to the switch.
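# For instance, switch("b", a=1, b=2, stop("no match")) evaluates to 2,
# while switch("z", a=1, b=2, stop("no match")) hits the unnamed default
# and signals the error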
guess <- switch(method,
nn=my.nn(q,LOO.frame,LOO.labels,preproc)$label,
prototypes=prototype.classifier(q,LOO.frame,LOO.labels,preproc),
stop("Unknown classifier method: ", method)
)
return((guess != true.label))
}
# Apply the leave-out function to the integers from 1 to n.examples, getting back a Boolean
# vector
classifier.errors = sapply(1:n.examples,LOO)
# The sum of a Boolean vector is the number of TRUE entries, so the next line gives the
# fraction of errors
error.rate = sum(classifier.errors)/n.examples
# Return the error rate and the set of indices on which there were errors
# the latter helps with debugging the code, and with understanding why it went wrong
return(list(error.rate=error.rate,error.indices=which(classifier.errors)))
}
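# Hypothetical usage sketch (stand-in data names again):
# loo <- leave.one.out.error(nyt.frame, nyt.labels, method="prototypes")
# loo$error.rate gives the fraction mis-classified; loo$error.indices says
# which training vectors were missed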
# Pre-process a query vector and a data frame with IDF weights and Euclidean
# length normalization
# IDF weights are taken solely from the frame, not including the query vector
# Inputs: new vector to classify/match (q),
# data frame (f) of vectors
# Presumes: q and f have the same column names
# Calls: get.idf.weights(), scale.cols(), div.by.euc.length()
# Outputs: list whose components are the standardized vector and data-frame
idf.Euc.preproc <- function(q,f) {
# Ideally, check here that the two arguments agree on their column names
# Scale the rows by Euclidean length
f = div.by.euc.length(f)
# Ditto the query
q = q/sqrt(sum(q^2))
# Do IDF scaling on the targets AND the query
# The function idf.weight() calculates the weights and re-scales the
# data frame, but we need the weights themselves to re-scale the query as
# well, so we write our own function get.idf.weights() --- see below
# Get the weights
idf = get.idf.weights(f)
# Scale the columns in the frame, modifying in place
f = scale.cols(f,idf)
# Scale the columns in the query vector too
q = q * idf
# Return the standardized query vector and data frame together in a list
return(list(new.vector=q,reference.frame=f))
}
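# Hypothetical usage sketch (stand-in names; div.by.euc.length() and
# scale.cols() must already be loaded from the first assignment):
# std <- idf.Euc.preproc(new.doc, nyt.frame)
# my.nn(std$new.vector, std$reference.frame, nyt.labels, preproc=FALSE)
# then skips the redundant pre-processing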
# Return the vector of IDF weights from a data frame
# The core of idf.weight(), but leaving out the actual re-scaling
# Input: data frame
# Output: weight vector
get.idf.weights <- function(x) {
# Count the number of documents (rows) containing each word
doc.freq <- colSums(x>0)
# Avoid dividing by zero for words which appear in no document
doc.freq[doc.freq == 0] <- 1
# IDF weight = log of (number of documents / number containing the word)
w <- log(nrow(x)/doc.freq)
return(w)
}
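# A self-contained toy check: in a two-document frame where "w1" appears in
# both documents and "w2" in only one,
# get.idf.weights(data.frame(w1=c(1,2), w2=c(0,3)))
# returns c(w1=log(2/2), w2=log(2/1)), i.e. approximately 0 and 0.69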