# =====================================================
# = Find all weblinks: Extract both URL and Link text =
# =====================================================
# Download the page source; readLines() returns one string per line of HTML
cmustat <- readLines('http://www.stat.cmu.edu/')
# Regular expression for one anchor element. It has two capture groups:
#   group 1: the value of the href attribute (the URL)
#   group 2: everything between the opening and closing tags (the link text)
# I paste together the pieces of the regular expression for legibility
linkpat <- paste0(
  '<a[[:space:]][^>]*href="([^"]*)"[^>]*>',  # opening tag, capturing the URL
  '(.*?)',                                   # link text, as short as possible
  '</a>')                                    # closing tag
# Of course this works too:
linkpat <- '<a[[:space:]][^>]*href="([^"]*)"[^>]*>(.*?)</a>'
# Two-pass approach.
# First pass: extract all disjoint matches into separate strings
# Second pass: extract 1 match and 2 groups from each string
# Extract all disjoint matches. The lazy quantifier (.*?) stops each match at
# the nearest closing tag, so two links on one line stay two matches.
# perl = TRUE gives the lazy quantifier well-defined (PCRE) semantics.
m <- gregexpr(linkpat, cmustat, ignore.case = TRUE, perl = TRUE)
links <- regmatches(cmustat, m)
# Flatten the per-line results into one character vector; as.character()
# turns the NULL produced by an empty list into character(0)
links <- as.character(unlist(links, use.names = FALSE))
# Extract match and groups: each list element becomes
# c(whole match, url, link text)
m <- regexec(linkpat, links, ignore.case = TRUE, perl = TRUE)
links <- regmatches(links, m)
# Create a data frame with url and link.text
# Set stringsAsFactors = FALSE so that the strings don't get converted to factors
df <- data.frame(
  url = vapply(links, function(x) x[2], character(1)),
  link.text = vapply(links, function(x) x[3], character(1)),
  stringsAsFactors = FALSE)
# =========================================
# = Generic function to extract web links =
# =========================================
# Extract every hyperlink from a page of HTML.
#
# Args:
#   html: character vector holding the page source, e.g. the result of
#         readLines(). Each element is searched on its own, so an anchor
#         that is split across several elements is not found.
#
# Returns:
#   A data frame with one row per link and two character columns:
#     url       - the value of the href attribute
#     link.text - everything between the opening and closing anchor tags
#   When no link is found the data frame has zero rows but still has both
#   columns.
extractLinks <- function(html) {
  # Group 1 captures the URL, group 2 the link text. The lazy (.*?) stops at
  # the nearest closing tag so several links on one line stay separate.
  linkpat <- '<a[[:space:]][^>]*href="([^"]*)"[^>]*>(.*?)</a>'
  # First pass: pull every disjoint anchor out of every element of html
  m <- gregexpr(linkpat, html, ignore.case = TRUE, perl = TRUE)
  anchors <- as.character(unlist(regmatches(html, m), use.names = FALSE))
  # Second pass: split each anchor into c(whole match, url, link text)
  m <- regexec(linkpat, anchors, ignore.case = TRUE, perl = TRUE)
  parts <- regmatches(anchors, m)
  # vapply() keeps the columns character even when parts is empty
  data.frame(
    url = vapply(parts, function(x) x[2], character(1)),
    link.text = vapply(parts, function(x) x[3], character(1)),
    stringsAsFactors = FALSE)
}
# Examples: collect the links found on two live home pages
cmulinks <- extractLinks(
  html = readLines(con = 'http://cmu.edu/', warn = FALSE))
nytimeslinks <- extractLinks(
  html = readLines(con = 'http://nytimes.com/', warn = FALSE))
# =================
# = Random Surfer =
# =================
# Take one step of a random walk over the web.
#
# Args:
#   url: address of the page to visit (passed straight to readLines()).
#
# Returns:
#   The url of one link on that page, chosen uniformly at random among the
#   links whose url starts with "http:". Stops with an error when the page
#   has no such link.
randomSurf <- function(url) {
  cat('Visiting', url, '\n')
  page <- readLines(url, warn = FALSE)
  candidates <- extractLinks(page)
  # Keep fully-qualified, non-encrypted URLs only; relative URLs would need
  # to be resolved against the current page, which this example skips
  is.http <- grepl('^(http:)', candidates$url, ignore.case = TRUE)
  candidates <- candidates[is.http, ]
  n.candidates <- nrow(candidates)
  # Nothing left to follow: dead end
  if (n.candidates == 0) {
    stop('D\'oh! I\'m at a deadend')
  }
  # Uniform random row index between 1 and n.candidates
  pick <- sample(n.candidates, size = 1)
  candidates$url[pick]
}
# ==========================
# = Test the random surfer =
# ==========================
# Start at Yahoo and follow ten randomly chosen links in a row
current <- 'http://www.yahoo.com'
for (i in seq_len(10)) {
  current <- randomSurf(url = current)
}
# ========================
# = Bouncy Random Surfer =
# ========================
# Random surfer that bounces back instead of failing.
#
# Args:
#   url:         address of the page to visit.
#   previousurl: address to fall back to when the step from url fails.
#
# Returns:
#   The result of randomSurf(url) when it succeeds; otherwise previousurl,
#   returned unchanged (so NULL comes back if previousurl is NULL).
bouncySurf <- function(url, previousurl) {
  # try() turns any error raised by randomSurf() into a "try-error" object
  result <- try(randomSurf(url))
  # inherits() is the robust class test: it stays a single TRUE/FALSE even
  # if the object carries more than one class
  if (inherits(result, 'try-error')) {
    return(previousurl)
  }
  result
}
# =================================
# = Test the bouncy random surfer =
# =================================
# Ten bouncy steps starting from Yahoo; there is no previous page at first
previous <- NULL
current <- 'http://www.yahoo.com'
for (i in seq_len(10)) {
  nexturl <- bouncySurf(url = current, previousurl = previous)
  # Shift the window: the page just left becomes the fallback
  previous <- current
  current <- nexturl
}
# ==================================
# = Do all roads lead to Facebook? =
# ==================================
# Surf at random (bouncing back on failures) until a Facebook URL is reached.
#
# Args:
#   start: address of the page to start from.
#
# Returns:
#   The number of steps taken before the current URL contains "facebook.com"
#   (0 when start already does). Stops with an error when the very first
#   step fails, because there is no earlier page to bounce back to.
surfUntilFacebook <- function(start) {
  nsteps <- 0
  previous <- NULL
  current <- start
  # The dot is escaped so that it matches a literal "." only
  while (!grepl('facebook\\.com', current, ignore.case = TRUE)) {
    nexturl <- bouncySurf(current, previous)
    # bouncySurf() hands back `previous` on failure, which is NULL on the
    # first step; fail clearly instead of breaking the loop condition
    if (is.null(nexturl)) {
      stop('Could not leave ', current,
           ' and there is no previous page to bounce back to')
    }
    previous <- current
    current <- nexturl
    nsteps <- nsteps + 1
    # End the line so the next progress message starts on a fresh one
    cat('Next:', nexturl, '\n')
  }
  cat('Hit Facebook after', nsteps, 'steps\n')
  nsteps
}
# Count the steps from Yahoo's home page to the first Facebook URL
surfUntilFacebook(start = 'http://www.yahoo.com/')