# 1
x <- readLines("http://www.stat.cmu.edu/~cshalizi/statcomp/labs/09/dates.txt")

#a
# Pattern: "<", one or more letters, whitespace, one or more digits, ">".
# It would match text such as "<April 12>".
pat <- "<[[:alpha:]]+[[:space:]]+[[:digit:]]+>"

#b
# Positions and lengths of every match, one list element per line of x.
# Computed once and reused below; left as a bare expression so it prints.
date.pos <- gregexpr(pat, x)
date.pos

#c
# Extract the matched text from ALL lines of x (not only the first one),
# then drop the enclosing "<" and ">" from each match.
dates <- unlist(regmatches(x, date.pos))
substring(dates, 2, nchar(dates) - 1)

# 2
sj <- readLines("http://www.stat.cmu.edu/~cshalizi/statcomp/labs/09/stevejobs.txt")

#a
# sj is a character vector. Its length is given by length(sj).
# Each element of sj is a string corresponding to a line in the file
# stevejobs.txt.

#b
# Join all lines into one string, then split it at ".", "!" or "?" (together
# with any surrounding whitespace) to get one element per sentence.
sj.long <- paste(sj, collapse = " ")
sj.sentences <- strsplit(sj.long, "[[:space:]]*[.!?]{1}[[:space:]]*")[[1]]

#c
# Print, one per line, the sentences mentioning "life" or "death"
# (case-insensitive).
i <- grep("life|death", sj.sentences, ignore.case = TRUE)
cat(sj.sentences[i], sep = "\n")

#d
# Split the full text into words at runs of whitespace and/or ".", "!", "?".
sj.words <- strsplit(sj.long, "[[:space:]]*[.!?[:space:]]+[[:space:]]*")
# Alternative starting from the sentences:
#   sj.words <- strsplit(sj.sentences, "[[:space:]]+")
# strsplit() returns a list; flatten it to a single character vector.
sj.words <- unlist(sj.words)
sj.words <- tolower(sj.words)

#e
# Strip every leading and trailing punctuation character (not just one),
# then drop tokens that were punctuation only and are now empty, so they
# are not counted as a "word".
sj.words <- gsub("^[[:punct:]]+|[[:punct:]]+$", "", sj.words)
sj.words <- sj.words[nzchar(sj.words)]

#f
# Frequency of each distinct word.
wc <- table(sj.words)