# Functions for document retrieval and simple similarity searching
# Modified from code written by Tom Minka
# Overview: These are functions to
# (1) read in XML documents in the format of the New York Times Annotated
# Corpus data set and convert the body of the story into a vector of
# character strips (primarily: read.doc)
# (2) Normalize those character strings by removing punctuation,
# converting all words to lower cae, etc. (primarily: strip.text)
# (3) Turn each document into a bag-of-words _vector_, counting word
# occurences
# (4) Integrate multiple bag-of-words vectors into a single data frame,
# with rows for each document and columns formthe union of all words
# appearing across all documents
# (5) Calculate Euclidean distances between documents and find the
# closest match to a given document
# Read in an XML news story and extract its full text
# Input: filename
# Calls: XML package (available from CRAN)
# strip.text() [defined below]
# Assumes: the input is a single file, with XML formatted as in the New York
# Times Annotated Corpus (does not test this)
# Assumes: filename in fact names a single file
# Output: vector of character strings, giving the words in order
read.doc <- function(filename) {
require(XML)
doc <- xmlRoot(xmlInternalTreeParse(filename))
# ASSUMES: filename specifies a single file
# ASSUMES: document follows nitf format
node.set <- getNodeSet(doc,path="//block[@class='full_text']")
# Find the parts of the file which say they contain the full text of the
# news story; there should be only one
fulltext <- sapply(node.set,xmlValue) # get the text
# ASSUMES: this should be a SINGLE character string
text <- strip.text(fulltext) # Turn into a vector of strings
return(text)
}
# Feel free to use this IF you comment it
read.directory <- function(dirname,verbose=FALSE) {
stories = list()
filenames = dir(dirname,full.names=TRUE)
for (i in 1:length(filenames)) {
if(verbose) {
print(filenames[i])
}
stories[[i]] = read.doc(filenames[i])
}
return(stories)
}
# Turn a string into a vector of words
# for comparability across bags of words, also strip out punctuation and
# numbers, and shift all letters into lower case
# Input: character string
# Output: vector of words (character strings)
strip.text <- function(txt) {
# remove apostrophes (so "don't" -> "dont", "Jane's" -> "Janes", etc.)
txt <- gsub("'","",txt)
# convert to lowercase
txt <- tolower(txt)
# change other non-alphanumeric characters to spaces
txt <- gsub("[^a-z0-9]"," ",txt)
# change digits to #
txt <- gsub("[0-9]+","#",txt)
# split and make one vector
txt <- unlist(strsplit(txt," "))
# remove empty words
txt <- txt[txt != ""]
return(txt)
}
# Remove columns from a ragged array which only appear in one row
# Input: Ragged array (vectors with named columns)
# Output: Ragged array, with columns appearing in only one vector deleted
remove.singletons.ragged <- function(x) {
# Collect all the column names, WITH repetition
col.names <- c()
for(i in 1:length(x)) {
col.names <- c(col.names, names(x[[i]]))
}
# See how often each name appears
count <- table(col.names)
# Loop over vectors and keep only the columns which show up more than once
for(i in 1:length(x)) {
not.single <- (count[names(x[[i]])] > 1)
x[[i]] <- x[[i]][not.single]
}
return(x)
}
# Standardize a ragged array so all vectors have the same length and ordering
# Supplies NAs for missing values
# Input: a list of vectors with named columns
# Output: a standardized list of vectors with named columns
standardize.ragged <- function(x) {
# Keep track of all the column names from all the vectors in a single vector
col.names <- c()
# Get the union of column names by iterating over the vectors - using
# setdiff() is faster than taking unique of the concatenation, the more
# obvious approach
for(i in 1:length(x)) {
col.names <- c(col.names, setdiff(names(x[[i]]),col.names))
}
# put the column names in alphabetical order, for greater comprehensibility
col.names <- sort(col.names)
# Now loop over the vectors again, putting them in order and filling them out
# Note: x[[y]] returns NA if y is not the name of a column in x
for (i in 1:length(x)) {
x[[i]] <- x[[i]][col.names]
# Make sure the names are right
names(x[[i]]) <- col.names
}
return(x)
}
# Turn a list of bag-of-words vectors into a data frame, one row per bag
# Input: list of BoW vectors (x),
# list of row names (row.names, optional),
# flag for whether singletons should be removed,
# flag for whether words missing in a document should be coded 0
# Output: data frame, columns named by the words and rows matching documents
make.BoW.frame <- function(x,row.names,remove.singletons=TRUE,
absent.is.zero=TRUE) {
# Should we remove one-time-only words?
if (remove.singletons) {
y <- remove.singletons.ragged(x)
} else {
y <- x
}
# Standardize the column names
y <- standardize.ragged(y)
# Transform the list into an array
# There are probably slicker ways to do this
z = y[[1]] # Start with the first row
if (length(y) > 1) { # More than one row?
for (i in 2:length(y)) {
z = rbind(z,y[[i]],deparse.level=0) # then stack them
}
}
# Make the data frame
# use row names if provided
if(missing(row.names)) {
BoW.frame <- data.frame(z)
} else {
BoW.frame <- data.frame(z,row.names=row.names)
}
if (absent.is.zero) {
# The standardize.ragged function maps missing words to "NA"; replace
# those with zeroes to simplify calculation
BoW.frame <- apply(BoW.frame,2,function(q){ifelse(is.na(q),0,q)})
}
return(BoW.frame)
}
# Rescale the columns of a data frame or array by a given weight vector
# Input: arrray, weight vector
# Output: scaled array
scale.cols <- function(x,s) {
return(t(apply(x,1,function(x){x*s})))
}
# Rescale rows of an array or data frame by a given weight vector
# Input: array, weight vector
# Output: scaled array
scale.rows <- function(x,s) {
return(apply(x,2,function(x){x*s}))
}
# Compute inverse document frequency weights and rescale a data frame
# Input: data frame
# Calls: scale.cols
# Output: scaled data-frame
idf.weight <- function(x) {
# IDF weighting
doc.freq <- colSums(x>0)
doc.freq[doc.freq == 0] <- 1
w <- log(nrow(x)/doc.freq)
return(scale.cols(x,w))
}
# Normalize vectors by the sum of their entries
# Input assumed to be a set of vectors in array form, one vector per row
# Input: matrix/data frame/array
# Calls: scale.rows()
# Presumes: No rows sum to 0
# Output: matrix/data frame/array
div.by.sum <- function(x) {
scale.rows(x,1/(rowSums(x)))
}
# Normalize vectors by their Euclidean length
# Input assumed to be a set of vectors in array form, one vector per row
# Input: array
# Calls: scale.rows()
# Presumes: No rows sum to 0
# Output: array
div.by.euc.length <- function(x) {
scale.rows(x,1/sqrt(rowSums(x^2)))
}
# Produce a distance matrix from a data frame
# Assumes rows in the data frame (or other array) are vectors
# By default uses Euclidean distance but could use other functions as well
# cf. the built-in function dist()
# Input: array, optional distance function
# Calls: sq.Euc.dist()
# Output: matrix of distances
distances <- function(x,fun) {
# Use Euclidean distance by default
if (missing(fun)) {
return(sqrt(sq.Euc.dist(x)))
}
# otherwise, run the function fun over all combinations of rows
else {
# make a new array
n <- nrow(x)
d <- array(NA,c(n,n),list(rownames(x),rownames(x))) #preserve row-names,
# but also make them column names
# iterate over row-pair combinations
for(i in 1:n) {
for(j in 1:n) {
# fill the entries of the array
d[i,j] <- fun(x[i,],x[j,])
}
}
# we're done
return(d)
}
}
# Calculate the squared Euclidean distances between two sets of vectors
# specifically, d[i,j] is the squared distance from x[i,] to y[j,]
# Input: vectors in matrix form (one per row),
# second set of vectors ditto (if missing assumed equal to first)
# Output: matrix of distances
sq.Euc.dist <- function(x,y=x) {
x <- as.matrix(x)
y <- as.matrix(y)
nr=nrow(x)
nc=nrow(y)
x2 <- rowSums(x^2)
xsq = matrix(x2,nrow=nr,ncol=nc)
y2 <- rowSums(y^2)
ysq = matrix(y2,nrow=nr,ncol=nc,byrow=TRUE)
xy = x %*% t(y)
d = xsq + ysq - 2*xy
if(identical(x,y)) diag(d) = 0
d[which(d < 0)] = 0
return(d)
}
# For each vector (row) in matrix A, return the index of and the distance to
# the index of the closest point to the matrix B
# If the matrix B is omitted, then it's assumed to be A, but no point is
# allowed to be its own own closest match
# A pre-computed distance matrix is an optional argument, otherwise it's
# computed in the squared Euclidean metric
# Input: matrix A, matrix B (optional),
# matrix of distances between them (optional)
# Output: list of two vectors, one ($which) giving the indices of best matches,
# the other ($dist) giving the distances to those matches
nearest.points <- function(a,b=a,d=sqrt(sq.Euc.dist(a,b))) {
# "allocate" a vector, giving the distances to the best matches
b.dist = numeric(nrow(a))
if (identical(a,b)) {
diag(d) = Inf
}
b.which = apply(d,1,which.min)
for (i in 1:nrow(a)) {
b.dist[i] = d[i,b.which[i]]
}
return(list(which=b.which,dist=b.dist))
}