# Functions for document retrieval and simple
# similarity searching
# Modified from code written by Tom Minka

# Read in a document and turn it into a vector
# of word-instances
  # by default, treats everything before the first
  # blank line as a header and remove it
# Input: filename, flag for header removal
# Calls: strip.text
# Output: a vector of character strings, giving
#         the words in order
read.doc <- function(fname,remove.header=T) {
  txt <- readLines(fname)
  if(remove.header) {
    # remove header
    i <- which(txt == "")
    if(length(i) > 0) {
      txt <- txt[-(1:i[1])]
    }
  }
  strip.text(txt)
}

# Turn a string into a vector of words
   # for comparability across bags of words,
   # also strip out punctuation and numbers, and
   # shift all letters into lower case
# Input: character string
# Output: vector of words (character strings)
strip.text <- function(txt) {
  # remove apostrophes
  txt <- gsub("'","",txt)
  # convert to lowercase
  txt <- tolower(txt)
  # change other non-alphanumeric characters to spaces
  txt <- gsub("[^a-z0-9]"," ",txt)
  # change digits to #
  txt <- gsub("[0-9]+","#",txt)
  # split and make one vector
  txt <- unlist(strsplit(txt," "))
  # remove empty words
  txt <- txt[txt != ""]
  return(txt)
}

# Rescale the columns of a data frame or array by
# a given weight vector
# Input: arrray, weight vector
# Output: scaled array
scale.cols <- function(x,s) {
  return(t(apply(x,1,function(x){x*s})))
}

# Rescale rows by a given weight vec<tor
# Input: array, weight vector
# Output: scaled array
scale.rows <- function(x,s) {
	return(apply(x,2,function(x){x*s}))
}

# Compute inverse document frequency weights and
# rescale a data frame
# Input: data frame
# Calls: scale.cols
# Output: scaled data-frame
idf.weight <- function(x) {
  # IDF weighting
  doc.freq <- colSums(x>0)
  doc.freq[doc.freq == 0] <- 1
  w <- log(nrow(x)/doc.freq)
  return(scale.cols(x,w))
}

# Normalize vectors by the sum of their entries
  # Input assumed to be a set of vectors in array
  # form, one vector per row
  # QUESTION: What is the 1e-16 doing here?
# Input: matrix/data frame/array
# Calls: scale.rows()
# Output: matrix/data frame/array
div.by.sum <- function(x) {
  scale.rows(x,1/(rowSums(x)+1e-16))
}

# Normalize vectors by their Euclidean length
  # Input assumed to be a set of vectors in array
  # form, one vector per row
  # QUESTION: What is the 1e-16 doing here?
# Input: array
# Calls: scale.rows()
# Output: array
div.by.euc.length <- function(x) {
  scale.rows(x,1/sqrt(rowSums(x^2)+1e-16))
}


# Remove columns from a ragged array which only
# appear in one row
# Input: Ragged array (vectors with named columns)
# Output: Ragged array, with columns appearing in only one vector deleted
remove.singletons.ragged <- function(x) {
	# Collect all the column names, WITH repetition
  col.names <- c()
  for(i in 1:length(x)) {
    col.names <- c(col.names, names(x[[i]]))
  }
  # See how often each name appears
  count <- table(col.names)
  # Loop over vectors and keep only the columns
  # which show up more than once
  for(i in 1:length(x)) {
    not.single <- (count[names(x[[i]])] > 1)
    x[[i]] <- x[[i]][not.single]
  }
  return(x)
}


# Standardize a ragged array so all vectors
# have the same length and ordering, supplying
# NAs for missing values
# Input: a list of vectors with named columns
# Output: a standardized list of vectors with named columns
standardize.ragged <- function(x) {
  # Keep track of all the column names from all
  # the vectors in a single vector
  col.names <- c()
  # Get the union of column names by iterating
  # over the vectors - using setdiff is faster
  # than taking unique of the concatenation (the)
  # more obvious approach
  for(i in 1:length(x)) {
    col.names <- c(col.names, setdiff(names(x[[i]]),col.names))
  }
  # put the column names in order, for greater
  # comprehensibility
  col.names <- sort(col.names)
  # Now loop over the vectors again, putting
  # them in order and filling them out
  # Note that x[[y]] returns NA if y is not the
  # name of a column in x
  for (i in 1:length(x)) {
    x[[i]] <- x[[i]][col.names]
    # Make sure the names are right
    names(x[[i]]) <- col.names
  }
  return(x)
}

# Turn a list of bag-of-words vectors into a 
# unified data frame, one row per bag of words
# Input: list of BoW vectors (x),
#   list of row names (row.names, optional),
#   flag for whether singletons should be removed,
#   flag for whether words missing in a document
#     should be coded 0
# Output: data frame, columns named by the words
#   and rows matching documents
make.BoW.frame <- function(x,row.names,remove.singletons=TRUE,absent.is.zero=TRUE) {
  # Should we remove one-time-only words?
  if (remove.singletons) {
  	y <- remove.singletons.ragged(x)
  } else {
  	y <- x
  }
  # Standardize the column names
  y <- standardize.ragged(y)
  # Transform the list into an array
     # There are probably slicker ways to do this
  z = y[[1]] # Start with the first row
  if (length(y) > 1) { # More than one row?
    for (i in 2:length(y)) {
  	   z = rbind(z,y[[i]],deparse.level=0) # then stack them
  	 }
  }
  # Make the data frame
     # use row names if provided
  if(missing(row.names)) {
	  BoW.frame <- data.frame(z)
  } else {
  	BoW.frame <- data.frame(z,row.names=row.names)
  }
  if (absent.is.zero) {
  	# The standardize.ragged function codes
  	# missing words as "NA"; replace those with
  	# zeroes to simplify calculation
  	BoW.frame <- apply(BoW.frame,2,function(x){ifelse(is.na(x),0,x)})
  	   # NB this x isn't our original argument!
  }
  return(BoW.frame)
}

# Produce a distance matrix from a data frame
  # Assumes rows in the data frame (or other array)
  # are vectors
  # By default uses Euclidean distance but could
  # use other functions as well
  # cf. the built-in function dist()
# Input: array, optional distance function
# Calls: sq.Euc.dist()
# Output: matrix of distances
distances <- function(x,fun) {
  # Use Euclidean distance by default
  if (missing(fun)) {
  	return(sqrt(sq.Euc.dist(x)))
  }
  # otherwise, run the function fun over all
  # combinations of rows
  else {
  	# make a new array
    n <- nrow(x)
    d <- array(NA,c(n,n),list(rownames(x),rownames(x))) #preserve row-names, but also make them column names
    # iterate over row-pair combinations
    for(i in 1:n) {
      for(j in 1:n) {
      	  # fill the entries of the array
        d[i,j] <- fun(x[i,],x[j,])
      }
    }
    # we're done
    return(d)
  }
}

# calculate the squared Euclidean distances between
# two sets of vectors
# specifically, d[i,j] is the squared distance
# from x[i,] to y[j,]
# Input: vectors in matrix form (one per row),
#        second set of vectors ditto (if missing
#          assumed equal to first)
# Output: matrix of distances
sq.Euc.dist <- function(x,y=x) {
  x <- as.matrix(x)
  y <- as.matrix(y)
  nr=nrow(x)
  nc=nrow(y)
  x2 <- rowSums(x^2)
  xsq = matrix(x2,nrow=nr,ncol=nc)
  y2 <- rowSums(y^2)
  ysq = matrix(y2,nrow=nr,ncol=nc,byrow=TRUE)
  xy = x %*% t(y)
  d = xsq + ysq - 2*xy
  if(identical(x,y)) diag(d) = 0
  d[which(d < 0)] = 0
  return(d)
}

# For each vector (row) in matrix A, return the
# index of and the distance to the index of the
# closest point to the matrix B
  # If the matrix B is omitted, then it's assumed
  # to be A, but no point is allowed to be its own
  # own closest match
  # The user can supply a pre-computed distance
  # matrix as an optional argument
# Input: matrix A, matrix B (optional),
  # matrix of distances between them (optional)
# Output: list of vectors, one giving the
  # indices, the other giving the distances
nearest.points <- function(a,b=a,d=sqrt(sq.Euc.dist(a,b))) {
  # "allocate" a vector, giving the distances to
  # the best matches
  b.dist = numeric(nrow(a))

  if (identical(a,b)) {
  	diag(d) = Inf
  }
  b.which = apply(d,1,which.min)
  for (i in 1:nrow(a)) {
  	b.dist[i] = d[i,b.which[i]]
  }
  return(list(which=b.which,dist=b.dist))
}