# ======================================
# = Find e-mail addresses on a webpage =
# ======================================
# Download the faculty listing as a character vector, one element per line.
cmustat <- readLines('http://www.stat.cmu.edu/people/faculty')

# Basic address shape: local part, "@", then a dotted domain ending in letters.
emailpattern <- '[[:alnum:]\\-_.+]+@[[:alnum:]\\-_.]+\\.[[:alpha:]]+'
# Show every line that contains at least one match.
grep(pattern = emailpattern, x = cmustat, value = TRUE)

# Looser variant: optional whitespace is tolerated on either side of the "@".
emailpattern2 <- '[[:alnum:]\\-_.+]+[[:space:]]*@[[:space:]]*[[:alnum:]\\-_.%+]+\\.[[:alpha:]]+'
grep(pattern = emailpattern2, x = cmustat, value = TRUE)

# Collapse all lines into one newline-separated string.
cmustat <- paste(cmustat, collapse = '\n')

# Start positions and lengths of every match in the collapsed page.
m <- gregexpr(emailpattern2, cmustat)

# Pull out the matched text and flatten the per-string list into one
# character vector.
emails <- unlist(regmatches(cmustat, m))

# Strip the whitespace that the looser pattern let through, then keep each
# address only once.
emails <- unique(gsub('[[:space:]]+', '', emails))
# =====================
# = Find all weblinks =
# =====================
# Download the department home page as a character vector, one element per line.
cmustat <- readLines('http://www.stat.cmu.edu/')

# An opening anchor tag that carries a double-quoted href attribute.
# NOTE(review): the pattern previously stored here was the fragment ']*>',
# which matches the end of every tag; this is a reconstruction of the
# anchor-tag pattern the surrounding comments describe -- confirm it matches
# the intended markup.
linkpat <- '<a[[:space:]][^>]*href="[^"]*"[^>]*>'

# Show every line that contains at least one link.
grep(linkpat, cmustat, value = TRUE, ignore.case = TRUE)

# Locate all anchor tags on every line and pull out their text.
m <- gregexpr(linkpat, cmustat, ignore.case = TRUE)
urls <- regmatches(cmustat, m)

# Concatenate all N matches into a character vector of length N.
# as.character() keeps the result a character vector even when the page is
# empty and unlist() returns NULL.
urls <- as.character(unlist(urls))

# Extract the URL part by tagging it as a subexpression (capture group).
linkpat2 <- '<a[[:space:]][^>]*href="([^"]*)"[^>]*>'
m <- regexec(linkpat2, urls, ignore.case = TRUE)
urls <- regmatches(urls, m)

# plyr is not needed by the extraction below; the call is kept in case code
# elsewhere in the file relies on the package being attached.
library(plyr)

# Extract the second item of each element of urls --- first item is the match,
# second is the first group, ... A non-matching element is character(0), for
# which x[2] is NA.
urls <- vapply(urls, function(x) x[2], character(1))

# Remove missing values and empty strings. nchar(NA) is NA, so a plain
# nchar(urls) > 0 test would keep NA entries in the result.
urls <- urls[!is.na(urls) & nzchar(urls)]