# Functions for document retrieval and simple
# similarity searching
# Modified from code written by Tom Minka
# Read in a document and turn it into a vector
# of word-instances
# by default, treats everything before the first
# blank line as a header and remove it
# Input: filename, flag for header removal
# Calls: strip.text
# Output: a vector of character strings, giving
# the words in order
read.doc <- function(fname,remove.header=T) {
txt <- readLines(fname)
if(remove.header) {
# remove header
i <- which(txt == "")
if(length(i) > 0) {
txt <- txt[-(1:i[1])]
}
}
strip.text(txt)
}
# Turn a string into a vector of words
# for comparability across bags of words,
# also strip out punctuation and numbers, and
# shift all letters into lower case
# Input: character string
# Output: vector of words (character strings)
strip.text <- function(txt) {
# remove apostrophes
txt <- gsub("'","",txt)
# convert to lowercase
txt <- tolower(txt)
# change other non-alphanumeric characters to spaces
txt <- gsub("[^a-z0-9]"," ",txt)
# change digits to #
txt <- gsub("[0-9]+","#",txt)
# split and make one vector
txt <- unlist(strsplit(txt," "))
# remove empty words
txt <- txt[txt != ""]
return(txt)
}
# Rescale the columns of a data frame or array by
# a given weight vector
# Input: arrray, weight vector
# Output: scaled array
scale.cols <- function(x,s) {
return(t(apply(x,1,function(x){x*s})))
}
# Rescale rows by a given weight vec0)
doc.freq[doc.freq == 0] <- 1
w <- log(nrow(x)/doc.freq)
return(scale.cols(x,w))
}
# Normalize vectors by the sum of their entries
# Input assumed to be a set of vectors in array
# form, one vector per row
# QUESTION: What is the 1e-16 doing here?
# Input: matrix/data frame/array
# Calls: scale.rows()
# Output: matrix/data frame/array
div.by.sum <- function(x) {
scale.rows(x,1/(rowSums(x)+1e-16))
}
# Normalize vectors by their Euclidean length
# Input assumed to be a set of vectors in array
# form, one vector per row
# QUESTION: What is the 1e-16 doing here?
# Input: array
# Calls: scale.rows()
# Output: array
div.by.euc.length <- function(x) {
scale.rows(x,1/sqrt(rowSums(x^2)+1e-16))
}
# Remove columns from a ragged array which only
# appear in one row
# Input: Ragged array (vectors with named columns)
# Output: Ragged array, with columns appearing in only one vector deleted
remove.singletons.ragged <- function(x) {
# Collect all the column names, WITH repetition
col.names <- c()
for(i in 1:length(x)) {
col.names <- c(col.names, names(x[[i]]))
}
# See how often each name appears
count <- table(col.names)
# Loop over vectors and keep only the columns
# which show up more than once
for(i in 1:length(x)) {
not.single <- (count[names(x[[i]])] > 1)
x[[i]] <- x[[i]][not.single]
}
return(x)
}
# Standardize a ragged array so all vectors
# have the same length and ordering, supplying
# NAs for missing values
# Input: a list of vectors with named columns
# Output: a standardized list of vectors with named columns
standardize.ragged <- function(x) {
# Keep track of all the column names from all
# the vectors in a single vector
col.names <- c()
# Get the union of column names by iterating
# over the vectors - using setdiff is faster
# than taking unique of the concatenation (the)
# more obvious approach
for(i in 1:length(x)) {
col.names <- c(col.names, setdiff(names(x[[i]]),col.names))
}
# put the column names in order, for greater
# comprehensibility
col.names <- sort(col.names)
# Now loop over the vectors again, putting
# them in order and filling them out
# Note that x[[y]] returns NA if y is not the
# name of a column in x
for (i in 1:length(x)) {
x[[i]] <- x[[i]][col.names]
# Make sure the names are right
names(x[[i]]) <- col.names
}
return(x)
}
# Turn a list of bag-of-words vectors into a
# unified data frame, one row per bag of words
# Input: list of BoW vectors (x),
# list of row names (row.names, optional),
# flag for whether singletons should be removed,
# flag for whether words missing in a document
# should be coded 0
# Output: data frame, columns named by the words
# and rows matching documents
make.BoW.frame <- function(x,row.names,remove.singletons=TRUE,absent.is.zero=TRUE) {
# Should we remove one-time-only words?
if (remove.singletons) {
y <- remove.singletons.ragged(x)
} else {
y <- x
}
# Standardize the column names
y <- standardize.ragged(y)
# Transform the list into an array
# There are probably slicker ways to do this
z = y[[1]] # Start with the first row
if (length(y) > 1) { # More than one row?
for (i in 2:length(y)) {
z = rbind(z,y[[i]],deparse.level=0) # then stack them
}
}
# Make the data frame
# use row names if provided
if(missing(row.names)) {
BoW.frame <- data.frame(z)
} else {
BoW.frame <- data.frame(z,row.names=row.names)
}
if (absent.is.zero) {
# The standardize.ragged function codes
# missing words as "NA"; replace those with
# zeroes to simplify calculation
BoW.frame <- apply(BoW.frame,2,function(x){ifelse(is.na(x),0,x)})
# NB this x isn't our original argument!
}
return(BoW.frame)
}
# Produce a distance matrix from a data frame
# Assumes rows in the data frame (or other array)
# are vectors
# By default uses Euclidean distance but could
# use other functions as well
# cf. the built-in function dist()
# Input: array, optional distance function
# Calls: sq.Euc.dist()
# Output: matrix of distances
distances <- function(x,fun) {
# Use Euclidean distance by default
if (missing(fun)) {
return(sqrt(sq.Euc.dist(x)))
}
# otherwise, run the function fun over all
# combinations of rows
else {
# make a new array
n <- nrow(x)
d <- array(NA,c(n,n),list(rownames(x),rownames(x))) #preserve row-names, but also make them column names
# iterate over row-pair combinations
for(i in 1:n) {
for(j in 1:n) {
# fill the entries of the array
d[i,j] <- fun(x[i,],x[j,])
}
}
# we're done
return(d)
}
}
# calculate the squared Euclidean distances between
# two sets of vectors
# specifically, d[i,j] is the squared distance
# from x[i,] to y[j,]
# Input: vectors in matrix form (one per row),
# second set of vectors ditto (if missing
# assumed equal to first)
# Output: matrix of distances
sq.Euc.dist <- function(x,y=x) {
x <- as.matrix(x)
y <- as.matrix(y)
nr=nrow(x)
nc=nrow(y)
x2 <- rowSums(x^2)
xsq = matrix(x2,nrow=nr,ncol=nc)
y2 <- rowSums(y^2)
ysq = matrix(y2,nrow=nr,ncol=nc,byrow=TRUE)
xy = x %*% t(y)
d = xsq + ysq - 2*xy
if(identical(x,y)) diag(d) = 0
d[which(d < 0)] = 0
return(d)
}
# For each vector (row) in matrix A, return the
# index of and the distance to the index of the
# closest point to the matrix B
# If the matrix B is omitted, then it's assumed
# to be A, but no point is allowed to be its own
# own closest match
# The user can supply a pre-computed distance
# matrix as an optional argument
# Input: matrix A, matrix B (optional),
# matrix of distances between them (optional)
# Output: list of vectors, one giving the
# indices, the other giving the distances
nearest.points <- function(a,b=a,d=sqrt(sq.Euc.dist(a,b))) {
# "allocate" a vector, giving the distances to
# the best matches
b.dist = numeric(nrow(a))
if (identical(a,b)) {
diag(d) = Inf
}
b.which = apply(d,1,which.min)
for (i in 1:nrow(a)) {
b.dist[i] = d[i,b.which[i]]
}
return(list(which=b.which,dist=b.dist))
}