# Build a data frame of the wealthy from web sources with regular expressions,
# in two different ways. Both methods assign to `pat`, `wealthy` and `df`, so
# when the script runs top to bottom the Method 2 results replace Method 1's.

library(plyr)  # ldply(): apply a function to each list element, row-bind results

# Method 1 ----

baseurl <- 'http://www.stat.cmu.edu/~cshalizi/statcomp/hw/09/'
urls <- paste0(baseurl, 'page', 1:4, '.html')

# Read the four pages and flatten them into a single newline-separated string
# so that one regex search can run across line and page boundaries.
page <- lapply(urls, readLines, warn = FALSE)
page <- lapply(page, paste, collapse = '\n')
html <- paste(unlist(page), collapse = '\n')

# NOTE(review): the literal HTML tags that presumably anchored each fragment of
# this pattern look like they were stripped from this copy of the script. As it
# stands, the fragment '[^\\s*' opens a bracket expression that it never closes
# itself, so the assembled pattern does not describe six clean capture groups.
# Restore the tag text from the original before relying on Method 1.
pat <- paste0(
  '\\s*',
  '(\\d+)\\s*',
  '\\s*',
  '[^\\s*',
  '\n\n([^<]+)\n\n\\s*',
  '\\s*',
  '\\$(\\d+\\.?\\d*)\\s*B\\s*',
  '([^<]+)\\s*',
  '([^<]+)\\s*',
  '([^<]+)\\s*',
  '')

# First pass: extract disjoint cases
m <- gregexpr(pat, html, ignore.case = TRUE)
x <- regmatches(html, m)
x <- do.call(c, x)

# Second pass: extract capture groups
m <- regexec(pat, x, ignore.case = TRUE)
wealthy <- regmatches(x, m)

# Put it all together. Element 1 of each regexec() match is the full match, so
# the capture groups start at index 2.
df <- ldply(wealthy, function(x) data.frame(
  rank = as.numeric(x[2]),
  name = x[3],
  net.worth = as.numeric(x[4]),
  age = as.numeric(x[5]),
  residence = x[6],
  source = x[7],
  stringsAsFactors = FALSE))

# Method 2 ----

raw400 <- readLines('http://www.forbes.com/forbes-400/ajax/loadList',
                    warn = FALSE)

# Break a string of '"key": value' pairs into its individual pairs.
# Helper only; the remaining Method 2 steps below do not call it.
#
# html: character string to search.
# Returns a list with one character vector per pair found, holding the full
# match, the key (word characters between double quotes) and the value
# (everything up to the next comma).
parseone <- function(html) {
  pat <- '"(\\w+)"\\s*:\\s*([^,]*)'

  # First pass: extract disjoint cases
  m <- gregexpr(pat, html, ignore.case = TRUE)
  x <- regmatches(html, m)
  x <- do.call(c, x)

  # Second pass: extract capture groups
  m <- regexec(pat, x, ignore.case = TRUE)
  regmatches(x, m)
}

# The fragments are joined with '.*', so any text may sit between two
# consecutive fields of a record.
# NOTE(review): '[^\\]+' seems to rely on the default regex engine treating a
# backslash inside a bracket expression literally ("one or more characters that
# are not a backslash"); '[^]]+' may be what was meant - confirm.
pat <- paste(
  '"rank":(\\d+),',
  '"firstName":"([^"]*)",',
  '"lastName":"([^"]*)",',
  '"finalWorth":(\\d+\\.?\\d*),',
  '"age":(\\d+|\\-),',
  '"city":"([^"]*)",',
  '"state":"([^"]*)",',
  '"source":"([^"]*)",',
  '"industries":\\[([^\\]+)\\],',
  sep = '.*')

# First pass: Extract disjoint cases --- Billionaires are separated by },{
# The lines are collapsed first: indexing strsplit()'s result with [[1]] would
# otherwise keep only the first line returned by readLines() and fail outright
# on an empty response.
split400 <- strsplit(paste(raw400, collapse = ''), '\\},\\{')[[1]]

# Second pass: Extract capture groups
m <- regexec(pat, split400, ignore.case = TRUE)
wealthy <- regmatches(split400, m)

# Put it all together. As above, index 1 is the full match and the capture
# groups follow from index 2.
df <- ldply(wealthy, function(x) data.frame(
  rank = as.numeric(x[2]),
  name = paste(x[3], x[4]),
  net.worth = as.numeric(x[5]),
  # The pattern also accepts '-' as an age; as.numeric() converts that to NA
  # and emits a coercion warning.
  age = as.numeric(x[6]),
  residence = paste0(x[7], ', ', x[8]),
  source = x[9],
  industries = gsub('"', '', x[10]),  # Extra
  stringsAsFactors = FALSE))