5 November 2014

## Agenda

• Quick Review: Split-Apply-Combine
• base R functions: *apply(), by(), aggregate(), split()

## General strategy

These are tools that, when put together, give us three general steps:

• Split whatever data object we have into meaningful chunks
• Apply the function of interest to this division
• Combine the results into a new object.
x <- split(strikes, strikes$country) y <- lapply(x, strikes_vs_left, coefficients.only=TRUE) coefs <- do.call(rbind, y) ## *apply in base R • apply() – arrays • lapply() – list or vector, output list • sapply() – list or vector, simplify to vector • vapply() – list or vector, safer simplify to vector • tapply(), by(), aggregate() – data frames (tables) • mapply() – multiple vectors (special case of 2d array) Useful set, but some issues: inconsistent output ## Back to the drawing board: plyr We have three common data structures beyond vectors: arrays (matrices), data frames, lists. #install.packages ("plyr") library(plyr) Functions within: ?*ply(), replace ? with character denoting types: • first character: input type a, d, l • second character: output type: a, d, l, or "drop" _ ## a*ply() – input arrays processed <- a*ply(.data, .margins, .fun, ...) • .data – an array • .margins – subscripts which the function gets applied over • .fun – the function to be applied • ... – additional arguments to function Looks like processed <- apply (data, margin, function) ## a*ply() – input arrays Let's test it out! my.boring.array <- array(1:27, c(3,3,3)) rownames(my.boring.array) <- c("Curly", "Larry", "Moe") colnames(my.boring.array) <- c("Groucho", "Harpo", "Zeppo") dimnames(my.boring.array)[[3]] <- c("Bart", "Lisa", "Maggie") adply (my.boring.array, 1, sum) ## X1 V1 ## 1 Curly 117 ## 2 Larry 126 ## 3 Moe 135 ## a*ply() – input arrays alply (my.boring.array, 2, sum) ##$1
## [1] 99
##
## $2 ## [1] 126 ## ##$3
## [1] 153
##
## attr(,"split_type")
## [1] "array"
## attr(,"split_labels")
##        X1
## 1 Groucho
## 2   Harpo
## 3   Zeppo

## a*ply() – input arrays

aaply (my.boring.array, 3, sum)
##   Bart   Lisa Maggie
##     45    126    207

## a*ply() – input arrays

adply (my.boring.array, 1:2, sum)
##      X1      X2 V1
## 1 Curly Groucho 30
## 2 Larry Groucho 33
## 3   Moe Groucho 36
## 4 Curly   Harpo 39
## 5 Larry   Harpo 42
## 6   Moe   Harpo 45
## 7 Curly   Zeppo 48
## 8 Larry   Zeppo 51
## 9   Moe   Zeppo 54

## a*ply() – input arrays

alply (my.boring.array, 2:3, sum)
## $1 ## [1] 6 ## ##$2
## [1] 15
##
## $3 ## [1] 24 ## ##$4
## [1] 33
##
## $5 ## [1] 42 ## ##$6
## [1] 51
##
## $7 ## [1] 60 ## ##$8
## [1] 69
##

## d*ply – operate on data frames

Strikes example again:

strikes <- read.csv("strikes.csv")
my.strike.lm.better <- function (country.df) {
lm(strike.volume ~ left.parliament + unemployment + inflation,
data=country.df)$coefficients } ## d*ply – operate on data frames Syntax: for data frame inputs, include the variables for the splitting in a stand-in function .() – syntax like by(). Output to array: results <- daply (strikes, .(country), my.strike.lm.better) results[1:5,] ## ## country (Intercept) left.parliament unemployment inflation ## Australia 157.9191 0.5658674 -1.1181489 30.4666061 ## Austria 600.6778 -11.2441604 -10.9216990 -0.5923972 ## Belgium -243.4823 12.4516118 0.3578217 10.2673539 ## Canada 167.0712 13.4386408 -48.1790264 27.2180651 ## Denmark -1331.7517 33.1605982 -8.1864655 5.1826201 ## d*ply – operate on data frames Output to data frame: in this case, same as array results <- ddply (strikes, .(country), my.strike.lm.better) results[1:5,] ## country (Intercept) left.parliament unemployment inflation ## 1 Australia 157.9191 0.5658674 -1.1181489 30.4666061 ## 2 Austria 600.6778 -11.2441604 -10.9216990 -0.5923972 ## 3 Belgium -243.4823 12.4516118 0.3578217 10.2673539 ## 4 Canada 167.0712 13.4386408 -48.1790264 27.2180651 ## 5 Denmark -1331.7517 33.1605982 -8.1864655 5.1826201 ## d*ply – operate on data frames Output to list: what you would expect. results <- dlply (strikes, .(country), my.strike.lm.better) results[1:3] ##$Australia
##     (Intercept) left.parliament    unemployment       inflation
##     157.9191118       0.5658674      -1.1181489      30.4666061
##
## $Austria ## (Intercept) left.parliament unemployment inflation ## 600.6777769 -11.2441604 -10.9216990 -0.5923972 ## ##$Belgium
##     (Intercept) left.parliament    unemployment       inflation
##    -243.4822938      12.4516118       0.3578217      10.2673539

## l*ply – operate on lists

processed <- l*ply(.data, , .fun, ...)
• .data – an array
• .fun – the function to be applied
• ... – additional arguments to function

Looks like

processed <- lapply (data, function, ...)

For demo sake:

strikes.split <- split (strikes, strikes$country) ## l*ply – operate on lists results <- laply (strikes.split, my.strike.lm.better) results[1:5,] ## (Intercept) left.parliament unemployment inflation ## [1,] 157.9191 0.5658674 -1.1181489 30.4666061 ## [2,] 600.6778 -11.2441604 -10.9216990 -0.5923972 ## [3,] -243.4823 12.4516118 0.3578217 10.2673539 ## [4,] 167.0712 13.4386408 -48.1790264 27.2180651 ## [5,] -1331.7517 33.1605982 -8.1864655 5.1826201 ## l*ply – operate on lists results <- ldply (strikes.split, my.strike.lm.better) results[1:5,] ## .id (Intercept) left.parliament unemployment inflation ## 1 Australia 157.9191 0.5658674 -1.1181489 30.4666061 ## 2 Austria 600.6778 -11.2441604 -10.9216990 -0.5923972 ## 3 Belgium -243.4823 12.4516118 0.3578217 10.2673539 ## 4 Canada 167.0712 13.4386408 -48.1790264 27.2180651 ## 5 Denmark -1331.7517 33.1605982 -8.1864655 5.1826201 ## l*ply – operate on lists results <- llply (strikes.split, my.strike.lm.better) results[1:3] ##$Australia
##     (Intercept) left.parliament    unemployment       inflation
##     157.9191118       0.5658674      -1.1181489      30.4666061
##
## $Austria ## (Intercept) left.parliament unemployment inflation ## 600.6777769 -11.2441604 -10.9216990 -0.5923972 ## ##$Belgium
##     (Intercept) left.parliament    unemployment       inflation
##    -243.4822938      12.4516118       0.3578217      10.2673539

## More dimensions

Recall the NHL data from last time:

print(load ("all-team-seasons-20132014.RData"))
## [1] "teamtable"
head (teamtable)
##      FAC_WIN FAC_LOSE PENL_TAKEN PENL_DRAWN HIT HIT_TAKEN CF CA FF FA SF
## MTL        4        1          2          1   3         6 12  5 10  4  4
## MTL1       3        2          0          0   0         3 11  9  7  5  6
## MTL2       6        7          2          2   9         7 12 13 11  9 10
## MTL3       5        2          2          1   5         6 21 12 16  8  9
## MTL4       2        1          3          0   0         0  2  1  2  1  2
## MTL5       3        2          2          2   0         1  3  1  1  1  1
##      SA GF GA Off Neu Def   TOI team opponent   season gcode scorediffcat
## MTL   3  1  0   2   3   0 632.0  MTL      TOR 20132014 20001            1
## MTL1  3  0  1   1   3   1 459.5  MTL      TOR 20132014 20001            2
## MTL2  8  1  0   3   4   6 641.0  MTL      TOR 20132014 20001            3
## MTL3  5  0  1   3   2   2 641.0  MTL      TOR 20132014 20001            4
## MTL4  1  0  0   2   1   0  91.0  MTL      TOR 20132014 20001            2
## MTL5  1  0  1   4   1   0 190.5  MTL      TOR 20132014 20001            3
##      gamestate home
## MTL          1    1
## MTL1         1    1
## MTL2         1    1
## MTL3         1    1
## MTL4         2    1
## MTL5         2    1
teamtable <- subset (teamtable, scorediffcat < 6)

## More dimensions, complexities

Split by team name, version 1:

teamperf <- ddply (teamtable[,c("GF","GA","HIT","HIT_TAKEN","team")],
.(team), function(dd) colSums(dd[,-5]))

## More dimensions, complexities

Split by team name:

teamperf <- ddply (teamtable, .(team),
function(dd) colSums(dd[,c("GF","GA","HIT","HIT_TAKEN")]))
head(teamperf)
##   team  GF  GA  HIT HIT_TAKEN
## 1  ANA 298 240 2515      2512
## 2  BOS 287 197 2441      2466
## 3  BUF 150 243 1992      1537
## 4  CAR 204 226 1807      1965
## 5  CBJ 241 234 2901      2109
## 6  CGY 201 238 1708      1463

## More dimensions, complexities

Split by team name and home/away:

teamperf <- ddply (teamtable, .(team, home),
function(dd) colSums(dd[,c("GF","GA","HIT","HIT_TAKEN")]))
head(teamperf)
##   team home  GF  GA  HIT HIT_TAKEN
## 1  ANA    0 136 129 1079      1245
## 2  ANA    1 162 111 1436      1267
## 3  BOS    0 137 113 1106      1270
## 4  BOS    1 150  84 1335      1196
## 5  BUF    0  65 124 1039       906
## 6  BUF    1  85 119  953       631

## More dimensions, complexities

Split by team name and game ID: The lethal case from last time

teamperf <- ddply (teamtable, .(team, gcode),
function(dd) colSums(dd[,c("GF","GA","HIT","HIT_TAKEN")]))
dim(teamperf)  ## worst case: 30*1323 rows.
## [1] 2646    6
head(teamperf)
##   team gcode GF GA HIT HIT_TAKEN
## 1  ANA 20006  1  6  13        21
## 2  ANA 20029  4  3  20        23
## 3  ANA 20033  3  2  21        25
## 4  ANA 20057  6  0  22        26
## 5  ANA 20078  4  1  17        28
## 6  ANA 20095  3  2  21        22

## More Bells And Whistles

plyr comes with a number of helper functions:

teamtable <- mutate (teamtable, blocksfor = CF - FF, blocksagainst = CA - FA, missesfor = FF - SF, missesagainst = FA - SA)
head(teamtable)
##      FAC_WIN FAC_LOSE PENL_TAKEN PENL_DRAWN HIT HIT_TAKEN CF CA FF FA SF
## MTL        4        1          2          1   3         6 12  5 10  4  4
## MTL1       3        2          0          0   0         3 11  9  7  5  6
## MTL2       6        7          2          2   9         7 12 13 11  9 10
## MTL3       5        2          2          1   5         6 21 12 16  8  9
## MTL4       2        1          3          0   0         0  2  1  2  1  2
## MTL5       3        2          2          2   0         1  3  1  1  1  1
##      SA GF GA Off Neu Def   TOI team opponent   season gcode scorediffcat
## MTL   3  1  0   2   3   0 632.0  MTL      TOR 20132014 20001            1
## MTL1  3  0  1   1   3   1 459.5  MTL      TOR 20132014 20001            2
## MTL2  8  1  0   3   4   6 641.0  MTL      TOR 20132014 20001            3
## MTL3  5  0  1   3   2   2 641.0  MTL      TOR 20132014 20001            4
## MTL4  1  0  0   2   1   0  91.0  MTL      TOR 20132014 20001            2
## MTL5  1  0  1   4   1   0 190.5  MTL      TOR 20132014 20001            3
##      gamestate home blocksfor blocksagainst missesfor missesagainst
## MTL          1    1         2             1         6             1
## MTL1         1    1         4             4         1             2
## MTL2         1    1         1             4         1             1
## MTL3         1    1         5             4         7             3
## MTL4         2    1         0             0         0             0
## MTL5         2    1         2             0         0             0

## More Bells And Whistles

plyr comes with a number of helper functions:

teambits <- summarize (teamtable, blocksfor = CF - FF, blocksagainst = CA - FA, missesfor = FF - SF, missesagainst = FA - SA)
head(teambits)
##   blocksfor blocksagainst missesfor missesagainst
## 1         2             1         6             1
## 2         4             4         1             2
## 3         1             4         1             1
## 4         5             4         7             3
## 5         0             0         0             0
## 6         2             0         0             0