# Download the page as a character vector, one element per line.
# The live Forbes URL is kept for reference only: the script used to fetch it
# and then immediately overwrite the result with the copy below, so the extra
# request added nothing except a second point of failure.
# html <- readLines('http://www.forbes.com/wealth/celebrities/list')
html <- readLines('http://www.stat.cmu.edu/~cshalizi/statcomp/lectures/24/celebrity100.html')

# Print the lines surrounding the `nth` line of `lines` that matches `pattern`
# (`before` lines above it and `after` lines below it). The window is clamped
# to the bounds of `lines`. If there are fewer than `nth` matching lines, a
# message is emitted and nothing is printed.
# Returns (invisibly) the index of the matching line, or NULL if there is none.
show_context <- function(lines, pattern, nth = 1, before = 4, after = 11) {
  hits <- grep(pattern, lines)
  if (length(hits) < nth) {
    message('No match number ', nth, ' for pattern: ', pattern)
    return(invisible(NULL))
  }
  i <- hits[nth]
  first <- max(1, i - before)
  last <- min(length(lines), i + after)
  cat(lines[first:last])
  invisible(i)
}

# Start with one case: Tiger Woods
# How is his data encoded in the html?
i <- show_context(html, 'Tiger Woods')
# Another case: Lady Gaga (the second matching line is the one inspected)
i <- show_context(html, 'Lady Gaga', nth = 2)
# Make one long string
html <- paste(html, collapse = '\n')
# ==============================================================
# = After much debugging, we arrive at this regular expression =
# ==============================================================
# Builds `pat` by joining the fragments below with \\s* as the separator, so
# any amount of whitespace (including none) is allowed between consecutive
# fragments.
# The fragments contain eight capture groups, in order: one (\\d+), one
# ([^>]+), one run of digits preceded by a literal dollar sign and followed by
# optional whitespace and M, and then five more (\\d+) groups.
# NOTE(review): every | in these literals is unescaped, so as written the
# regex engine treats it as alternation rather than literal text, and the
# first and last fragments are bare newlines. It looks like the literals may
# have lost markup (e.g. HTML table tags) at some point -- TODO confirm
# against the page source before relying on matches from this pattern.
pat <- paste(
'
',
'| (\\d+) | ',
'',
' ',
'([^>]+) | ',
'\\$([[:digit:]]+)\\s*M | ',
'(\\d+) | ',
'(\\d+) | ',
'(\\d+) | ',
'(\\d+) | ',
'(\\d+) | ',
'
',
sep = '\\s*')
# Pass 1: find every disjoint match of `pat` in the page text and flatten the
# matched substrings into a single character vector.
m <- gregexpr(pattern = pat, text = html, ignore.case = TRUE)
x <- unlist(regmatches(x = html, m = m))
# Pass 2: re-match each substring on its own with regexec(), which reports the
# capture groups. Each element of `celebs` is then a character vector holding
# the full match followed by the captured groups.
m <- regexec(pattern = pat, text = x, ignore.case = TRUE)
celebs <- regmatches(x = x, m = m)
# Turn each match into a one-row data frame and stack the rows into `df`.
library(plyr)
df <- ldply(celebs, function(groups) {
  # groups[1] is the full match, so capture group k sits at index k + 1.
  num <- function(k) as.numeric(groups[k])
  data.frame(
    rank = num(2),
    # Strip leading and trailing whitespace from the captured name.
    name = gsub('^\\s*|\\s*$', '', groups[3]),
    pay = num(4),
    money.rank = num(5),
    tvradio.rank = num(6),
    press.rank = num(7),
    web.rank = num(8),
    social.rank = num(9),
    stringsAsFactors = FALSE
  )
})
# ======================
# = Some Data Analysis =
# ======================
library(ggplot2)
# qplot() is deprecated as of ggplot2 3.4.0, so the same two scatterplots are
# built with ggplot() + geom_point() instead.
pay_by_web_rank <- ggplot(df, aes(x = web.rank, y = pay)) + geom_point()
# Pay against web rank on the raw scale
pay_by_web_rank
# Same scatterplot with pay on a log10 axis (equivalent of log = 'y')
pay_by_web_rank + scale_y_log10()