statshome <- paste(readLines("http://www.stat.cmu.edu/",warn=FALSE), collapse="") slurp <- function(url) { return(paste(readLines(url,warn=FALSE), collapse=" ")) } raise.anchors <- function(text) { match.locations <- gregexpr('href=".+?"',text,useBytes=TRUE) return(regmatches(text, match.locations)[[1]]) } anchors.with.absolute.urls <- function(urls) { return(urls[grepl('"http://',urls)]) } extract.links <- function(text) { anchors <- raise.anchors(text) good.anchors <- anchors.with.absolute.urls(anchors) url.locations <- gregexpr('http://[^"]+?', good.anchors) absolute.urls <- regmatches(good.anchors,url.locations) absolute.urls <- do.call(c,absolute.urls) return(absolute.urls) } surf.from <- function(some.urls,home="http://crookedtimber.org/") { if (length(some.urls) == 0) { return(home) } else { return(sample(some.urls,size=1)) } } surf.to <- function(origin,target,max.iter=20,verbose=TRUE,...) { url <- origin count <- 0 while ( !(grepl(target, url, ignore.case=TRUE)) &&(count < max.iter)) { text <- slurp(url) outgoing.urls <- extract.links(text) url <- surf.from(outgoing.urls,...) if(verbose) { cat(paste("surfing to",url,"\n")) } count <- count+1 } return(count) } surf.to.mk2 <- function(origin,target,max.iter=20,verbose=TRUE,...) { url <- origin last_url <- url count <- 0 while ( !(grepl(target, url, ignore.case=TRUE)) &&(count < max.iter)) { text <- try(slurp(url)) # Error handling if (class(text) == "try-error") { # If slurp() yields an error url <- last_url } else { # If slurp() worked OK last_url <- url outgoing.urls <- extract.links(text) url <- surf.from(outgoing.urls,...) if(verbose) { cat(paste("surfing to",url,"\n")) } count <- count+1 } } return(count) }