# Method 1: download the four HTML pages and merge them into one string
baseurl <- 'http://www.stat.cmu.edu/~cshalizi/statcomp/hw/09/'
urls <- paste0(baseurl, 'page', 1:4, '.html')
# Read each page and glue its lines together, so every page is one string
page <- lapply(urls, function(u) {
  paste(readLines(u, warn = FALSE), collapse = '\n')
})
# Join the pages, newline-separated, into the single document searched below
html <- paste(unlist(page), collapse = '\n')
# Regex applied to the downloaded pages by Method 1. The fragments are
# concatenated with no separator and contain six capture groups which the
# code below reads, in order, as: rank, name, net worth (the number between
# '$' and 'B'), age, residence and source.
# NOTE(review): the fragments look as if their HTML tags were lost at some
# point -- the '[^<]+' groups expect markup to follow, yet only bare ' | '
# separators and stray newlines remain. As written, every '|' is a regex
# alternation, so the pattern presumably does not match rows as intended.
# TODO: restore the original tags from the page source and confirm.
pat <- paste(
'
\\s*',
'| (\\d+) | \\s*',
'\\s*',
' \\s*',
'([^<]+)\\s*',
' | \\s*',
'\\$(\\d+\\.?\\d*)\\s*B | \\s*',
'([^<]+) | \\s*',
'([^<]+) | \\s*',
'([^<]+) | \\s*',
'
',
sep = '')
# Pass 1: pull every non-overlapping substring of the document matching pat
m <- gregexpr(pat, html, ignore.case = TRUE)
x <- unlist(regmatches(html, m))
# Pass 2: re-match each substring on its own to recover the capture groups
m <- regexec(pat, x, ignore.case = TRUE)
wealthy <- regmatches(x, m)
# Assemble one data-frame row per match. Element 1 of each match vector is
# the full match, so the capture groups start at index 2.
library(plyr)
df <- ldply(wealthy, function(hit) {
  data.frame(
    rank = as.numeric(hit[2]),
    name = hit[3],
    net.worth = as.numeric(hit[4]),
    age = as.numeric(hit[5]),
    residence = hit[6],
    source = hit[7],
    stringsAsFactors = FALSE)
})
# Method 2: fetch the raw response behind the Forbes 400 list.
# warn = FALSE mirrors the readLines() call used for Method 1: readLines()
# otherwise warns when the last line has no trailing newline, which is noise
# for a response that is parsed as text rather than read line by line.
raw400 <- readLines('http://www.forbes.com/forbes-400/ajax/loadList',
                    warn = FALSE)
# Extract every `"key": value` pair found in a character vector.
#
# html: character vector to search.
# Returns a list with one character vector per pair found, holding the full
# match, the key (word characters between double quotes) and the value
# (everything after the colon up to, but not including, the next comma).
parseone <- function(html) {
  # "key", optional whitespace around the colon, then a comma-free value
  pair_pat <- '"(\\w+)"\\s*:\\s*([^,]*)'
  # Pass 1: collect all non-overlapping pair substrings across the input
  hits <- gregexpr(pair_pat, html, ignore.case = TRUE)
  pairs <- unlist(regmatches(html, hits))
  # Pass 2: re-match each pair on its own to expose its capture groups
  regmatches(pairs, regexec(pair_pat, pairs, ignore.case = TRUE))
}
# Regex for one record of the response. The fragments are joined with '.*',
# so arbitrary text may sit between consecutive fields, but the fields must
# appear in this order. Capture groups: rank, first name, last name, net
# worth, age (digits or a literal '-'), city, state, source, industries.
pat <- paste(
  '"rank":(\\d+),',
  '"firstName":"([^"]*)",',
  '"lastName":"([^"]*)",',
  '"finalWorth":(\\d+\\.?\\d*),',
  '"age":(\\d+|\\-),',
  '"city":"([^"]*)",',
  '"state":"([^"]*)",',
  '"source":"([^"]*)",',
  # '[^]]' means "any character except ]" (a leading ']' is literal inside a
  # bracket expression). The previous '[^\\]' excluded backslashes instead,
  # so the group could run greedily past the closing bracket of the list.
  '"industries":\\[([^]]+)\\],',
  sep = '.*')
# First pass: Extract disjoint cases --- Billionaires are separated by },{
# Collapse first so that a response arriving as several lines is not silently
# truncated to its first line (and an empty response yields zero records
# instead of a subscript error).
split400 <- strsplit(paste(raw400, collapse = ''), '\\},\\{')[[1]]
# Second pass: Extract capture groups
m <- regexec(pat, split400, ignore.case = TRUE)
wealthy <- regmatches(split400, m)
# Chunks that do not match come back as character(0); indexing those would
# fabricate rows such as name "NA NA", so drop them and say so.
matched <- vapply(wealthy, length, integer(1)) > 0
if (!all(matched)) {
  warning(sum(!matched),
          ' record(s) did not match the pattern and were dropped',
          call. = FALSE)
  wealthy <- wealthy[matched]
}
# Put it all together. Element 1 of each match vector is the full match, so
# the capture groups start at index 2.
library(plyr)
df <- ldply(wealthy, function(x) data.frame(
  rank = as.numeric(x[2]),
  name = paste(x[3], x[4]),
  net.worth = as.numeric(x[5]),
  # The pattern accepts '-' in place of a number; map it to NA explicitly
  # rather than through a coercion warning.
  age = if (x[6] == '-') NA_real_ else as.numeric(x[6]),
  residence = paste0(x[7], ', ', x[8]),
  source = x[9],
  industries = gsub('"', '', x[10]), # Extra
  stringsAsFactors = FALSE))