###############################
# Solutions to Homework 2     #
# 36-350, Fall 2008           #
###############################
# Presumes: code from the first assignment and its solutions is available
# and loaded
# Nearest neighbor classifier for documents
# Assign a new vector to the same class as the closest example vector which
# has a class label
# Recycles the nearest.points() function from the solutions to the last
# problem set: find the document in the data frame closest to the given
# document vector, and return the index of that document. Then take the
# corresponding element from the list of class labels.
# Optionally does IDF weighting and Euclidean length scaling
# Inputs: document to be classified (in vector form),
# data frame of bag-of-word example vectors,
# vector of class labels for examples,
# flag for whether to do IDF/Euclidean length normalization
# Presumes: a data frame has been created for bag-of-word vectors, column names
# being words;
# the document to be classified has been reduced to this form with the
# same columns;
# the class labels are correct and given in the same order as the rows
# in the data frame
# One could make the vector of labels an extra column of the
# data frame, which would be more R-slick and keep them from
# getting out of order
# Calls: idf.Euc.preproc(), nearest.points()
# Outputs: label guessed for first input,
# index of its nearest neighbor (for diagnostic purposes)
my.nn <- function(new.vec,examples.frame,examples.labels,preproc=TRUE) {
# Should check that examples.labels has one entry for each row of
# examples.frame
# In fact, should really require the user to name each row, and check
# that the row names are the same!
# Do we need to pre-process by applying IDF weights and scaling by length?
if (preproc) {
processed.data = idf.Euc.preproc(new.vec,examples.frame)
new.vec = processed.data$new.vector
examples.frame = processed.data$reference.frame
}
# Make the query vector into a matrix
new.vec = as.matrix(new.vec)
# Make it into a row matrix (if necessary)
if (dim(new.vec)[1] != 1) {
new.vec = t(new.vec)
}
# Find the closest match between new.vec and examples.frame
nearest.neighbor = nearest.points(new.vec,examples.frame)$which
# Get the label
neighbor.label = examples.labels[nearest.neighbor]
return(list(label=neighbor.label,nearest.neighbor = nearest.neighbor))
}
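# A hypothetical usage sketch ("nyt.frame" and "nyt.labels" are stand-in
# names for a labeled example frame and its label vector):
# my.nn(nyt.frame[1,], nyt.frame[-1,], nyt.labels[-1])
# guesses a label for the first document from its nearest neighbor among
# the rest, and reports which row that neighbor was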
# Do prototype classification
# Assign new vectors to the class whose prototype vector is closest
# Does the problem in two parts:
# 1. computes the prototypes by taking means of the labeled examples
# 2. passes that off to my.nn(), which already knows how to find the closest
# match
# Inputs: document to be classified (in vector form),
# data frame of bag-of-word example vectors,
# vector of class labels for examples,
# flag for IDF/Euclidean-length normalization
# Presumes: a data frame has been created for bag-of-word vectors, column names
# being words;
# the document to be classified has been reduced to this form with the
# same columns;
# the class labels are correct and given in the same order as the rows
# in the data frame
# One could make the vector of labels an extra column of the
# data frame, which would be more R-slick and keep them from
# getting out of order
# Calls: idf.Euc.preproc(), my.nn()
# Outputs: label guessed for first input
prototype.classifier <- function(new.vec,examples.frame,examples.labels,preproc=TRUE) {
# Should check that examples.labels has one entry for each row of
# examples.frame
# In fact, should really require the user to name each row, and check
# that the row names are the same!
# Do the vector to be classified and the data frame need pre-processing?
if (preproc) {
# If so, call the preprocessing function
processed.data = idf.Euc.preproc(new.vec,examples.frame)
new.vec = processed.data$new.vector
examples.frame = processed.data$reference.frame
}
# Make the prototypes
# This uses the moderately tricky R function aggregate()
# An alternative would be to write something like find.class.centers() below
class.prototypes = aggregate(examples.frame,
list(class.labels=examples.labels),
mean)
# This groups together all the rows in examples.frame corresponding to the
# same value in examples.labels and then takes the means of the columns.
# The output is made into a new data-frame with the first column named
# "class.labels" and giving the labels; the other columns are the same as
# in examples.frame.
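# A toy illustration of aggregate() (values invented for illustration):
# aggregate(data.frame(x=c(1,2,3,4)),
#           list(class.labels=c("a","a","b","b")), mean)
# gives a two-row frame: class.labels "a" with x = 1.5, "b" with x = 3.5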
# Having created the prototypes, pass the problem off to my.nn()
# We've already done the pre-processing here, so my.nn() doesn't need to
# re-do it
# We do need to split class.prototypes into its first column (the labels)
# and the others (the numerical values expected by my.nn())
label.set = class.prototypes$class.labels
my.match = my.nn(new.vec,class.prototypes[,-1],label.set,preproc=FALSE)
return(my.match$label)
}
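# As with my.nn(), a hypothetical usage sketch with the same stand-in names:
# prototype.classifier(nyt.frame[1,], nyt.frame[-1,], nyt.labels[-1])
# Note this returns only the guessed label, not an index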
# Group vectors by class labels and find the mean vector for each class
# More efficient just to use the R aggregate() function, but this shows what's
# actually going on
# Inputs: data frame of example vectors, vector of class labels
# Presumes: the rows in the data frame are in the same order as the labels
# Outputs: matrix with one row for each observed label, giving the mean of the
# columns for vectors with that label
find.class.centers <- function(examples,examples.labels) {
# Check what labels we actually have
label.set = sort(unique(examples.labels))
# Make a matrix to store the prototypes
# Each class gets its own row, the columns being the same as in the examples
# data frame
# Name each row after the corresponding class
# Note that "prototype" (without the "s") is already the name of a function
# in R's methods package; lengthening our name makes it less likely that
# typos will lead to bizarre bugs
class.prototypes = matrix(0,nrow=length(label.set),ncol=ncol(examples),
dimnames = list(label.set,colnames(examples)))
# For each label, take a separate mean of the example vectors
for (label in label.set) {
# Which rows are exemplars of that label?
exemplars = (examples.labels == label)
# Calculate a mean for each column, using only those rows
# Note that naming the rows in class.prototypes means that they can be
# addressed using their names --- we don't need numerical indices
# Need to check whether there is more than one exemplar, since a nice R
# trick only works if there is > 1 row
if (sum(exemplars) > 1) {
class.prototypes[label,] = apply(examples[exemplars,],2,mean)
# This complains if there is only one row
} else {
class.prototypes[label,] = unlist(examples[exemplars,])
# This just takes the single matching row; unlist() flattens the one-row
# data frame into a plain vector so it fits into the matrix row
# N.B. at least one component of exemplars must be TRUE (why?)
}
}
return(class.prototypes)
}
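# A self-contained toy check (numbers invented for illustration):
# toy.frame <- data.frame(w1=c(1,3,0,2), w2=c(0,2,4,4))
# toy.labels <- c("a","a","b","b")
# find.class.centers(toy.frame, toy.labels)
# gives row "a" = (2,1) and row "b" = (1,4), agreeing with
# aggregate(toy.frame, list(class.labels=toy.labels), mean)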
# Estimate error rate of a classifier by the leave-one-out method
# Each vector is, in turn, omitted from the training set, and then used
# as the test vector; the error rate is the proportion mis-classified
# Actual classification is done by either my.nn() or prototype.classifier()
# To "vectorize" the calculation, the function creates an internal function which takes a row
# number as an argument and tries to classify the vector in that row using the others. This
# internal function can then be applied to the numbers from 1 to n via sapply().
# This is a common trick for avoiding explicit loops in R, which are slow.
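# For instance, a toy version of the same trick:
# is.even <- function(i) { (i %% 2) == 0 }
# sapply(1:4, is.even)   # gives FALSE TRUE FALSE TRUE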
# Inputs: data-frame of training vectors,
# vector of class labels for training vectors,
# indicator of the method (allowed values are "nn" and "prototypes"),
# flag for IDF/Euclidean length preprocessing (default TRUE, but FALSE is handy for debugging)
# Presumes: training vectors have been put into a single data frame with common
# columns;
# order of rows in that frame matches order of labels
# Calls: my.nn(), prototype.classifier()
# Outputs: error rate (as a fraction),
# indices of the mis-classified training vectors (helps debugging and evaluation)
leave.one.out.error <- function(training.vectors,training.labels,method="nn",preproc=TRUE) {
# Ideally, should check that training.vectors has as many rows as
# training.labels has entries
# Need to refer back to the number of training examples repeatedly
n.examples = nrow(training.vectors)
# Make a function which leaves one training example out and tries to classify
# it
# Argument is the row number of the example to leave out
# Returned value is Boolean, TRUE if the classifier is in error, FALSE if
# it is right
LOO <- function(i) {
# Extract the left-out vector and its label
q = training.vectors[i,]
true.label = training.labels[i]
# Make versions of the data leaving that out
LOO.frame = training.vectors[-i,]
LOO.labels = training.labels[-i]
# Use the R switch() function to choose an actual classifier, based on the "method" argument
# See help(switch) for the syntax.
# The advantages of switch over a set of if() statements:
# 1. Cleaner code
# 2. Easier to modify later; if we code up a new classifier method we can just add another
# case to the switch.
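# For instance, switch("b", a=1, b=2, stop("no match")) evaluates to 2,
# while switch("z", a=1, b=2, stop("no match")) hits the unnamed default
# and signals the error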
guess <- switch(method,
nn=my.nn(q,LOO.frame,LOO.labels,preproc)$label,
prototypes=prototype.classifier(q,LOO.frame,LOO.labels,preproc),
stop("Unknown classifier method: ", method)
)
return((guess != true.label))
}
# Apply the leave-out function to the integers from 1 to n.examples, getting back a Boolean
# vector
classifier.errors = sapply(1:n.examples,LOO)
# The sum of a Boolean vector is the number of TRUE entries, so the next line gives the
# fraction of errors
error.rate = sum(classifier.errors)/n.examples
# Return the error rate and the set of indices on which there were errors
# the latter helps with debugging the code, and with understanding why it went wrong
return(list(error.rate=error.rate,error.indices=which(classifier.errors)))
}
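# Hypothetical usage sketch (stand-in data names again):
# loo <- leave.one.out.error(nyt.frame, nyt.labels, method="prototypes")
# loo$error.rate gives the fraction mis-classified; loo$error.indices says
# which training vectors were missed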
# Pre-process a query vector and a data frame with IDF weights and Euclidean
# length normalization
# IDF weights are taken solely from the frame, not including the query vector
# Inputs: new vector to classify/match (q),
# data frame (f) of vectors
# Presumes: q and f have the same column names
# Calls: get.idf.weights(), scale.cols(), div.by.euc.length()
# Outputs: list whose components are the standardized vector and data-frame
idf.Euc.preproc <- function(q,f) {
# Ideally, check here that the two arguments agree on their column names
# Scale the rows by Euclidean length
f = div.by.euc.length(f)
# Ditto the query
q = q/sqrt(sum(q^2))
# Do IDF scaling on the targets AND the query
# The function idf.weight() calculates the weights and re-scales the
# data frame, but we need the weights themselves to re-scale the query as
# well, so we write our own function get.idf.weights() --- see below
# Get the weights
idf = get.idf.weights(f)
# Scale the columns in the frame, modifying in place
f = scale.cols(f,idf)
# Scale the columns in the query vector too
q = q * idf
# Return the standardized query vector and data frame together in a list
return(list(new.vector=q,reference.frame=f))
}
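# Hypothetical usage sketch (stand-in names; div.by.euc.length() and
# scale.cols() must already be loaded from the first assignment):
# std <- idf.Euc.preproc(new.doc, nyt.frame)
# my.nn(std$new.vector, std$reference.frame, nyt.labels, preproc=FALSE)
# then skips the redundant pre-processing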
# Return the vector of IDF weights from a data frame
# The core of idf.weight(), but leaving out the actual re-scaling
# Input: data frame
# Output: weight vector
get.idf.weights <- function(x) {
# Count the number of documents (rows) containing each word
doc.freq <- colSums(x>0)
# Avoid dividing by zero for words which appear in no document
doc.freq[doc.freq == 0] <- 1
# IDF weight = log of (number of documents / number containing the word)
w <- log(nrow(x)/doc.freq)
return(w)
}
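# A self-contained toy check: in a two-document frame where "w1" appears in
# both documents and "w2" in only one,
# get.idf.weights(data.frame(w1=c(1,2), w2=c(0,3)))
# returns c(w1=log(2/2), w2=log(2/1)), i.e. approximately 0 and 0.69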