# =====================================================
# = Find all weblinks: Extract both URL and Link text =
# =====================================================

cmustat <- readLines('http://www.stat.cmu.edu/', warn = FALSE)

# I paste together the pieces of the regular expression for legibility.
# Group 1 captures the double-quoted href value, group 2 the link text.
# '(.*?)' is a non-greedy repeat, so two links on the same line are matched
# separately instead of being swallowed by one long match.
# NOTE(review): the pattern in the original file had lost its HTML-like parts
# (only ']*>(.*)' was left); this is a reconstruction -- confirm it against the
# intended pattern.
linkpat <- paste0(
  '<a[[:space:]]+[^>]*href *= *"([^"]*)"[^>]*>',
  '(.*?)',
  '</a>')

# Of course this works too:
linkpat <- '<a[[:space:]]+[^>]*href *= *"([^"]*)"[^>]*>(.*?)</a>'

# Two-pass approach.
# First pass: extract all disjoint matches into separate strings
# Second pass: extract 1 match and 2 groups from each string

# Extract all disjoint matches (one character vector per input line),
# then flatten them into a single character vector
m <- gregexpr(linkpat, cmustat, ignore.case = TRUE)
links <- regmatches(cmustat, m)
links <- as.character(unlist(links))

# Extract match and groups: each element becomes c(whole match, url, text)
m <- regexec(linkpat, links, ignore.case = TRUE)
links <- regmatches(links, m)

# Create a data frame with url and link.text
# Set stringsAsFactors = FALSE so that the strings don't get converted to
# factors. Base R is enough here, so no extra package has to be loaded.
df <- data.frame(
  url = vapply(links, function(x) x[2], character(1)),
  link.text = vapply(links, function(x) x[3], character(1)),
  stringsAsFactors = FALSE)

# =========================================
# = Generic function to extract web links =
# =========================================

# Extract all <a href="...">text</a> links from HTML source.
#
# html: character vector of HTML source, e.g. the result of readLines().
#       Links are matched within a single element; a link that is split
#       across several elements is not found.
#
# Returns a data frame with character columns `url` and `link.text`, one row
# per link found (zero rows, same columns, when nothing matches).
extractLinks <- function(html) {
  linkpat <- '<a[[:space:]]+[^>]*href *= *"([^"]*)"[^>]*>(.*?)</a>'

  # First pass: every disjoint match, flattened to one character vector.
  # as.character() turns the NULL that unlist() yields for "no input" into
  # character(0) so the second pass always gets a character vector.
  m <- gregexpr(linkpat, html, ignore.case = TRUE)
  matched <- as.character(unlist(regmatches(html, m)))

  # Second pass: split every match into c(whole match, url, link text)
  m <- regexec(linkpat, matched, ignore.case = TRUE)
  pieces <- regmatches(matched, m)

  data.frame(
    url = vapply(pieces, function(x) x[2], character(1)),
    link.text = vapply(pieces, function(x) x[3], character(1)),
    stringsAsFactors = FALSE)
}

# Examples
cmulinks <- extractLinks(readLines('http://cmu.edu/', warn = FALSE))
nytimeslinks <- extractLinks(readLines('http://nytimes.com/', warn = FALSE))

# =================
# = Random Surfer =
# =================

# Visit `url`, collect its links and return one of them picked uniformly at
# random.
#
# url: address of the page to visit (passed to readLines()).
#
# Returns the next URL as a character string. Signals an error when the page
# cannot be read or when it contains no usable link (a dead end).
randomSurf <- function(url) {
  cat('Visiting', url, '\n')
  html <- readLines(url, warn = FALSE)
  links <- extractLinks(html)

  # Only look at fully-qualified, non-encrypted URLs (because relative URLs
  # are too much to deal with in this example)
  j <- grepl('^(http:)', links$url, ignore.case = TRUE)
  links <- links[j, ]

  # Dead end?
  if (nrow(links) == 0) {
    stop('D\'oh! I\'m at a deadend')
  }

  # Draw a uniform random integer from 1 to nrow(links)
  i <- sample(nrow(links), size = 1)

  # Pick the next url at random
  links$url[i]
}

# ==========================
# = Test the random surfer =
# ==========================

current <- 'http://www.yahoo.com'
for (i in seq_len(10)) {
  current <- randomSurf(current)
}

# ========================
# = Bouncy Random Surfer =
# ========================

# Like randomSurf(), but bounce back to the previous page instead of failing.
#
# url:         address of the page to visit.
# previousurl: address to fall back to when visiting `url` fails (unreadable
#              page or dead end). May be NULL when there is no previous page.
#
# Returns the next URL; this is `previousurl` (possibly NULL) when the visit
# failed.
bouncySurf <- function(url, previousurl) {
  result <- try(randomSurf(url))
  # inherits() is the robust class test: class() may have length > 1
  if (inherits(result, 'try-error')) {
    nexturl <- previousurl
  } else {
    nexturl <- result
  }
  nexturl
}

# =================================
# = Test the bouncy random surfer =
# =================================

previous <- NULL
current <- 'http://www.yahoo.com'
for (i in seq_len(10)) {
  nexturl <- bouncySurf(current, previous)
  previous <- current
  current <- nexturl
}

# ==================================
# = Do all roads lead to Facebook? =
# ==================================

# Surf at random from `start` until a URL containing 'facebook.com' is reached.
#
# start:    URL of the first page.
# maxsteps: upper bound on the number of steps; the default Inf keeps surfing
#           until Facebook is hit.
#
# Returns the number of steps taken. Signals an error when `maxsteps` is
# exhausted, or when the start page fails and there is no previous page to
# bounce back to.
surfUntilFacebook <- function(start, maxsteps = Inf) {
  nsteps <- 0
  previous <- NULL
  current <- start

  # Fixed-string match on the lower-cased URL: the '.' must be a literal dot,
  # not the regex "any character"
  while (!grepl('facebook.com', tolower(current), fixed = TRUE)) {
    if (nsteps >= maxsteps) {
      stop('Gave up after ', nsteps, ' steps without hitting Facebook')
    }

    nexturl <- bouncySurf(current, previous)

    # bouncySurf() returns NULL when the very first visit fails; without this
    # check the loop condition would break on a zero-length value
    if (is.null(nexturl)) {
      stop('Cannot surf from ', current, ' and there is no previous page')
    }

    previous <- current
    current <- nexturl
    nsteps <- nsteps + 1
    cat('Next:', nexturl, '\n')
  }

  cat('Hit Facebook after', nsteps, 'steps\n')
  nsteps
}

surfUntilFacebook('http://www.yahoo.com/')