# Scrape the Forbes "Celebrity 100" table into a data frame and plot pay
# against web rank.
#
# NOTE(review): this script had lost its line breaks, which left every
# statement behind the leading `#` of a single line, so R treated the whole
# script as a comment and executed nothing. The statements are restored one
# per line below.

library(plyr)
library(ggplot2)

# Fetch the page ----

# The original read the live Forbes page and then immediately overwrote the
# result with the copy hosted at stat.cmu.edu, so only the latter was ever
# used. Read that copy directly. To go back to the live page, use:
#   'http://www.forbes.com/wealth/celebrities/list'
html <- readLines(
  'http://www.stat.cmu.edu/~cshalizi/statcomp/lectures/24/celebrity100.html'
)

# Inspect how one row is encoded ----

# Print the lines around the `which`-th line that mentions `name`.
#
# lines:  character vector, one element per line of the page.
# name:   literal text to look for (matched with fixed = TRUE).
# which:  which hit to show (1 = first).
# before, after: number of context lines on each side; the window is clipped
#         to the bounds of `lines` so it can never index outside the vector.
#
# Returns the printed line numbers invisibly, or NULL (after a message) when
# there are fewer than `which` hits.
show_context <- function(lines, name, which = 1L, before = 4L, after = 11L) {
  hits <- grep(name, lines, fixed = TRUE)
  if (length(hits) < which) {
    message('Fewer than ', which, ' line(s) mention "', name, '".')
    return(invisible(NULL))
  }
  centre <- hits[which]
  window <- seq(max(1L, centre - before), min(length(lines), centre + after))
  cat(lines[window], sep = '\n')
  invisible(window)
}

# Start with one case: Tiger Woods. How is his data encoded in the html?
show_context(html, 'Tiger Woods')

# Another case: Lady Gaga. The original deliberately took the second hit.
show_context(html, 'Lady Gaga', which = 2L)

# Make one long string so a single pattern can span several lines of markup.
html <- paste(html, collapse = '\n')

# ==============================================================
# = After much debugging, we arrive at this regular expression =
# ==============================================================

# NOTE(review): the fragments below are reproduced exactly as they reached
# review. The empty strings, the dangling '[^' and the bare '([^>]+)' padded
# with blank lines look like what remains after an HTML-stripping step removed
# the literal tags from the original pattern (presumably <tr>/<td>-style
# markup -- confirm against the page source). The pattern currently contains
# no tag literals at all, so restore them before trusting any result.
pat <- paste(
  '',
  '(\\d+)',                  # capture 1: overall rank
  '',
  '[^',
  '\n\n([^>]+)\n\n',         # capture 2: name
  '\\$([[:digit:]]+)\\s*M',  # capture 3: pay, written as "$<digits> M"
  '(\\d+)',                  # capture 4: money rank
  '(\\d+)',                  # capture 5: TV/radio rank
  '(\\d+)',                  # capture 6: press rank
  '(\\d+)',                  # capture 7: web rank
  '(\\d+)',                  # capture 8: social rank
  '',
  sep = '\\s*'               # allow any whitespace between fragments
)

# Extract the rows ----

# First pass: pull out each disjoint match (one per table row). `html` is a
# single string, so the result list has exactly one element.
row_hits <- gregexpr(pat, html, ignore.case = TRUE)
rows <- regmatches(html, row_hits)[[1L]]

# Fail here with a clear message instead of failing later in the plotting
# code on an empty data frame.
if (length(rows) == 0L) {
  stop('No rows matched `pat`; check the pattern against the page markup.',
       call. = FALSE)
}

# Second pass: re-match each row to get its capture groups. Element 1 of each
# result is the whole match; elements 2-9 are the eight captures.
celebs <- regmatches(rows, regexec(pat, rows, ignore.case = TRUE))

# Put it all together: one data-frame row per celebrity.
df <- ldply(celebs, function(fields) {
  data.frame(
    rank = as.numeric(fields[2]),
    name = gsub('^\\s+|\\s+$', '', fields[3]),  # trim surrounding whitespace
    pay = as.numeric(fields[4]),
    money.rank = as.numeric(fields[5]),
    tvradio.rank = as.numeric(fields[6]),
    press.rank = as.numeric(fields[7]),
    web.rank = as.numeric(fields[8]),
    social.rank = as.numeric(fields[9]),
    stringsAsFactors = FALSE
  )
})

# ======================
# = Some Data Analysis =
# ======================

# ggplot() replaces the deprecated qplot(); print() makes the plots render
# when the script is run through source() as well as interactively.
pay_vs_web <- ggplot(df, aes(x = web.rank, y = pay)) +
  geom_point()
print(pay_vs_web)

# Same plot with pay on a log scale.
print(pay_vs_web + scale_y_log10())