# Functions for document retrieval and simple # similarity searching # Modified from code written by Tom Minka # Read in a document and turn it into a vector # of word-instances # by default, treats everything before the first # blank line as a header and remove it # Input: filename, flag for header removal # Calls: strip.text # Output: a vector of character strings, giving # the words in order read.doc <- function(fname,remove.header=T) { txt <- readLines(fname) if(remove.header) { # remove header i <- which(txt == "") if(length(i) > 0) { txt <- txt[-(1:i[1])] } } strip.text(txt) } # Turn a string into a vector of words # for comparability across bags of words, # also strip out punctuation and numbers, and # shift all letters into lower case # Input: character string # Output: vector of words (character strings) strip.text <- function(txt) { # remove apostrophes txt <- gsub("'","",txt) # convert to lowercase txt <- tolower(txt) # change other non-alphanumeric characters to spaces txt <- gsub("[^a-z0-9]"," ",txt) # change digits to # txt <- gsub("[0-9]+","#",txt) # split and make one vector txt <- unlist(strsplit(txt," ")) # remove empty words txt <- txt[txt != ""] return(txt) } # Rescale the columns of a data frame or array by # a given weight vector # Input: arrray, weight vector # Output: scaled array scale.cols <- function(x,s) { return(t(apply(x,1,function(x){x*s}))) } # Rescale rows by a given weight vec0) doc.freq[doc.freq == 0] <- 1 w <- log(nrow(x)/doc.freq) return(scale.cols(x,w)) } # Normalize vectors by the sum of their entries # Input assumed to be a set of vectors in array # form, one vector per row # QUESTION: What is the 1e-16 doing here? # Input: matrix/data frame/array # Calls: scale.rows() # Output: matrix/data frame/array div.by.sum <- function(x) { scale.rows(x,1/(rowSums(x)+1e-16)) } # Normalize vectors by their Euclidean length # Input assumed to be a set of vectors in array # form, one vector per row # QUESTION: What is the 1e-16 doing here? # Input: array # Calls: scale.rows() # Output: array div.by.euc.length <- function(x) { scale.rows(x,1/sqrt(rowSums(x^2)+1e-16)) } # Remove columns from a ragged array which only # appear in one row # Input: Ragged array (vectors with named columns) # Output: Ragged array, with columns appearing in only one vector deleted remove.singletons.ragged <- function(x) { # Collect all the column names, WITH repetition col.names <- c() for(i in 1:length(x)) { col.names <- c(col.names, names(x[[i]])) } # See how often each name appears count <- table(col.names) # Loop over vectors and keep only the columns # which show up more than once for(i in 1:length(x)) { not.single <- (count[names(x[[i]])] > 1) x[[i]] <- x[[i]][not.single] } return(x) } # Standardize a ragged array so all vectors # have the same length and ordering, supplying # NAs for missing values # Input: a list of vectors with named columns # Output: a standardized list of vectors with named columns standardize.ragged <- function(x) { # Keep track of all the column names from all # the vectors in a single vector col.names <- c() # Get the union of column names by iterating # over the vectors - using setdiff is faster # than taking unique of the concatenation (the) # more obvious approach for(i in 1:length(x)) { col.names <- c(col.names, setdiff(names(x[[i]]),col.names)) } # put the column names in order, for greater # comprehensibility col.names <- sort(col.names) # Now loop over the vectors again, putting # them in order and filling them out # Note that x[[y]] returns NA if y is not the # name of a column in x for (i in 1:length(x)) { x[[i]] <- x[[i]][col.names] # Make sure the names are right names(x[[i]]) <- col.names } return(x) } # Turn a list of bag-of-words vectors into a # unified data frame, one row per bag of words # Input: list of BoW vectors (x), # list of row names (row.names, optional), # flag for whether singletons should be removed, # flag for whether words missing in a document # should be coded 0 # Output: data frame, columns named by the words # and rows matching documents make.BoW.frame <- function(x,row.names,remove.singletons=TRUE,absent.is.zero=TRUE) { # Should we remove one-time-only words? if (remove.singletons) { y <- remove.singletons.ragged(x) } else { y <- x } # Standardize the column names y <- standardize.ragged(y) # Transform the list into an array # There are probably slicker ways to do this z = y[[1]] # Start with the first row if (length(y) > 1) { # More than one row? for (i in 2:length(y)) { z = rbind(z,y[[i]],deparse.level=0) # then stack them } } # Make the data frame # use row names if provided if(missing(row.names)) { BoW.frame <- data.frame(z) } else { BoW.frame <- data.frame(z,row.names=row.names) } if (absent.is.zero) { # The standardize.ragged function codes # missing words as "NA"; replace those with # zeroes to simplify calculation BoW.frame <- apply(BoW.frame,2,function(x){ifelse(is.na(x),0,x)}) # NB this x isn't our original argument! } return(BoW.frame) } # Produce a distance matrix from a data frame # Assumes rows in the data frame (or other array) # are vectors # By default uses Euclidean distance but could # use other functions as well # cf. the built-in function dist() # Input: array, optional distance function # Calls: sq.Euc.dist() # Output: matrix of distances distances <- function(x,fun) { # Use Euclidean distance by default if (missing(fun)) { return(sqrt(sq.Euc.dist(x))) } # otherwise, run the function fun over all # combinations of rows else { # make a new array n <- nrow(x) d <- array(NA,c(n,n),list(rownames(x),rownames(x))) #preserve row-names, but also make them column names # iterate over row-pair combinations for(i in 1:n) { for(j in 1:n) { # fill the entries of the array d[i,j] <- fun(x[i,],x[j,]) } } # we're done return(d) } } # calculate the squared Euclidean distances between # two sets of vectors # specifically, d[i,j] is the squared distance # from x[i,] to y[j,] # Input: vectors in matrix form (one per row), # second set of vectors ditto (if missing # assumed equal to first) # Output: matrix of distances sq.Euc.dist <- function(x,y=x) { x <- as.matrix(x) y <- as.matrix(y) nr=nrow(x) nc=nrow(y) x2 <- rowSums(x^2) xsq = matrix(x2,nrow=nr,ncol=nc) y2 <- rowSums(y^2) ysq = matrix(y2,nrow=nr,ncol=nc,byrow=TRUE) xy = x %*% t(y) d = xsq + ysq - 2*xy if(identical(x,y)) diag(d) = 0 d[which(d < 0)] = 0 return(d) } # For each vector (row) in matrix A, return the # index of and the distance to the index of the # closest point to the matrix B # If the matrix B is omitted, then it's assumed # to be A, but no point is allowed to be its own # own closest match # The user can supply a pre-computed distance # matrix as an optional argument # Input: matrix A, matrix B (optional), # matrix of distances between them (optional) # Output: list of vectors, one giving the # indices, the other giving the distances nearest.points <- function(a,b=a,d=sqrt(sq.Euc.dist(a,b))) { # "allocate" a vector, giving the distances to # the best matches b.dist = numeric(nrow(a)) if (identical(a,b)) { diag(d) = Inf } b.which = apply(d,1,which.min) for (i in 1:nrow(a)) { b.dist[i] = d[i,b.which[i]] } return(list(which=b.which,dist=b.dist)) }