# ======================================
# = Find e-mail addresses on a webpage =
# ======================================

# Download the faculty page as a character vector, one element per line.
cmustat <- readLines("http://www.stat.cmu.edu/people/faculty")

# Local part, "@", domain, ".", alphabetic top-level domain.
# The hyphen is placed LAST inside each bracket expression so that it is a
# literal "-". With the default (POSIX/TRE) engine a backslash does not escape
# "-" inside brackets, so the former "\\-_" was read as the range "\" to "_"
# and hyphenated addresses were not matched.
emailpattern <- "[[:alnum:]_.+-]+@[[:alnum:]_.-]+\\.[[:alpha:]]+"
grep(emailpattern, cmustat, value = TRUE)

# Same idea, but allow whitespace around the @ mark.
# NOTE(review): "%" and "+" are accepted in the domain part here, as in the
# original pattern; they look like they were meant for the local part.
emailpattern2 <- paste0(
  "[[:alnum:]_.+-]+[[:space:]]*@[[:space:]]*",
  "[[:alnum:]_.%+-]+\\.[[:alpha:]]+"
)
grep(emailpattern2, cmustat, value = TRUE)

# Concatenate into one long string so that a match may span a line break
cmustat <- paste(cmustat, collapse = "\n")

# Get indices and lengths of matches
m <- gregexpr(pattern = emailpattern2, cmustat)

# Extract substrings (a list with one character vector per input string)
emails <- regmatches(cmustat, m)

# Flatten the matches into a single character vector
emails <- unlist(emails)

# Remove the whitespace that emailpattern2 tolerates around "@"
emails <- gsub("[[:space:]]+", "", emails)

# Remove duplicates
emails <- unique(emails)

# =====================
# = Find all weblinks =
# =====================

cmustat <- readLines("http://www.stat.cmu.edu/")

# An opening anchor tag: "<a", whitespace, then anything up to the closing ">".
# (The previous pattern, "]*>", had lost its "<a..." prefix and matched any ">".)
linkpat <- "<a[[:space:]][^>]*>"
grep(linkpat, cmustat, value = TRUE, ignore.case = TRUE)

m <- gregexpr(linkpat, cmustat, ignore.case = TRUE)
urls <- regmatches(cmustat, m)

# Concatenate all N matches into a character vector of length N
urls <- unlist(urls)

# Extract the URL part by tagging it as a subexpression: the parenthesised
# group captures the quoted value of the href attribute.
linkpat2 <- paste0(
  "<a[[:space:]][^>]*href[[:space:]]*=[[:space:]]*",
  "[\"']([^\"']*)[\"'][^>]*>"
)
m <- regexec(linkpat2, urls, ignore.case = TRUE)
urls <- regmatches(urls, m)

# Extract the second item of each element of urls --- first item is the match,
# second is the first group, ... Tags without an href yield character(0), for
# which x[2] is NA.
library(plyr) # no longer needed by this section; kept in case later code uses it
urls <- vapply(urls, function(x) x[2], character(1))

# Remove missing values (anchor tags without an href) and empty strings
urls <- urls[!is.na(urls) & nzchar(urls)]